
# IoT-based Multimodal Pipeline for Early Mastitis Detection
This notebook provides a robust, leak-safe and energy-aware pipeline:
- **Tabular model** on clinical ground truth (CSV)
- **Imaging model** (frozen EfficientNet features + LR) on image labels
- **Cross-modal bridge** (tab→image embeddings) enabling fusion even when cohorts are not perfectly aligned
- **Fail-safe fusion** with proper clinical evaluation


In [1]:
# ===== 1) Configuration & Paths =====
# Goal: centralise environment detection (Colab vs local), pick robust paths to data (images/COCO/tabular),
# and define runtime knobs (seeds, batch size, GPU toggle). This avoids path drift and makes runs reproducible.

import os, random, json, re, glob, math, shutil, time, warnings
from pathlib import Path
import numpy as np
import pandas as pd

# --- Detect whether we are in Google Colab ----------------------------------
# Why: On Colab, data usually lives in Google Drive; locally, we use the current working directory.
try:
    from google.colab import drive  # type: ignore
    IN_COLAB = True
except Exception:
    IN_COLAB = False

if IN_COLAB:
    # Mount Google Drive once. If already mounted, this is a no-op.
    try:
        drive.mount('/content/drive', force_remount=False)
    except Exception:
        pass
    BASE_DRIVE = Path("/content/drive/MyDrive")
else:
    # Fallback: run relative to the current working directory.
    BASE_DRIVE = Path(os.getcwd())

# --- Helper to choose the first existing path from a list --------------------
# Why: Users may keep data in slightly different locations; we accept multiple “candidates” and pick the first that exists.
def first_existing(paths):
    for p in paths:
        p = Path(p)
        if p.exists():
            return p
    return Path("")

# --- Project roots & dataset containers --------------------------------------
# Why: Normalise the notion of "project home" and "datasets" so downstream code can be path-agnostic.
PROJECT_ROOT_CANDIDATES = [
    BASE_DRIVE / "Mastitis_illness_cow",
    BASE_DRIVE / "mastitis_illness_cow",
    Path("/mnt/data/Mastitis_illness_cow"),
    Path(os.getcwd()) / "Mastitis_illness_cow",
]
PROJECT_ROOT = first_existing(PROJECT_ROOT_CANDIDATES) or PROJECT_ROOT_CANDIDATES[0]

DATASETS_DIR = first_existing([
    PROJECT_ROOT / "datasets",   # your primary layout
    PROJECT_ROOT / "data",
    Path("/mnt/data")
]) or (PROJECT_ROOT / "datasets")

# --- Canonical paths for images, COCO annotations, and tabular CSV ----------
# Why: We avoid hard failures here; we only warn. Hard assertions will happen in the loading cell.
IMAGE_DIR = first_existing([
    DATASETS_DIR / "images",                # e.g., /.../datasets/images  (typical)
    PROJECT_ROOT / "images",
])
COCO_JSON_PATH = first_existing([
    PROJECT_ROOT / "exports" / "_annotations.coco.json",  # e.g., /.../exports/_annotations.coco.json
    DATASETS_DIR / "_annotations.coco.json",
    Path("/mnt/data/_annotations.coco.json"),
])
TABULAR_CSV_PATH = first_existing([
    DATASETS_DIR / "clinical_mastitis_cows_version1.csv", # e.g., /.../datasets/clinical_mastitis_cows_version1.csv
    Path("/mnt/data/clinical_mastitis_cows_version1.csv"),
])
# Optional explicit mapping file: filename -> Cow_ID (useful when image folders do not encode cow IDs)
MAPPING_CSV_PATH = first_existing([
    DATASETS_DIR / "image_to_cow_map.csv",
])

# Legacy label dir (unused with COCO; we keep it printed for visibility when migrating notebooks)
LABEL_DIR = PROJECT_ROOT / "labels"

# --- Runtime knobs: GPU, batch size, seeds -----------------------------------
USE_GPU_FOR_IMAGE_MODEL = True
BATCH_SIZE_IMAGE = 32

SEED = 42
random.seed(SEED); np.random.seed(SEED)

def seed_all_torch(seed=42):
    """
    Set PyTorch seeds and deterministic flags if torch is available.
    Rationale: helps reproducibility even on CuDNN-enabled setups.
    """
    try:
        import torch
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    except Exception:
        # Torch not installed or GPU not present; silently ignore.
        pass

seed_all_torch(SEED)

# --- Human-friendly path diagnostics (no hard stop here) ---------------------
def exists_str(p: Path):
    return f"{str(p)}  [{'OK' if p and p.exists() else 'MISSING'}]"

print("IN_COLAB:", IN_COLAB)
print("BASE_DRIVE:", BASE_DRIVE)
print("PROJECT_ROOT:", exists_str(PROJECT_ROOT))
print("DATASETS_DIR:", exists_str(DATASETS_DIR))
print("IMAGE_DIR:", exists_str(IMAGE_DIR))
print("COCO_JSON_PATH:", exists_str(COCO_JSON_PATH))
print("TABULAR_CSV_PATH:", exists_str(TABULAR_CSV_PATH))
print("MAPPING_CSV_PATH (optional):", exists_str(MAPPING_CSV_PATH))
print("LABEL_DIR (legacy, unused with COCO):", exists_str(LABEL_DIR))

# Gentle warnings to catch misconfigurations early (we assert later in the load cell).
if not IMAGE_DIR or not IMAGE_DIR.exists():
    warnings.warn("IMAGE_DIR not found. Example: /content/drive/MyDrive/Mastitis_illness_cow/datasets/images")
if not COCO_JSON_PATH or not COCO_JSON_PATH.exists():
    warnings.warn("COCO JSON not found. Expected: '.../exports/_annotations.coco.json'.")
if not TABULAR_CSV_PATH or not TABULAR_CSV_PATH.exists():
    warnings.warn("Tabular CSV not found. Expected: '.../datasets/clinical_mastitis_cows_version1.csv'.")


Mounted at /content/drive
IN_COLAB: True
BASE_DRIVE: /content/drive/MyDrive
PROJECT_ROOT: /content/drive/MyDrive/Mastitis_illness_cow  [OK]
DATASETS_DIR: /content/drive/MyDrive/Mastitis_illness_cow/datasets  [OK]
IMAGE_DIR: /content/drive/MyDrive/Mastitis_illness_cow/datasets/images  [OK]
COCO_JSON_PATH: /content/drive/MyDrive/Mastitis_illness_cow/exports/_annotations.coco.json  [OK]
TABULAR_CSV_PATH: /content/drive/MyDrive/Mastitis_illness_cow/datasets/clinical_mastitis_cows_version1.csv  [OK]
MAPPING_CSV_PATH (optional): /content/drive/MyDrive/Mastitis_illness_cow/datasets/image_to_cow_map.csv  [OK]
LABEL_DIR (legacy, unused with COCO): /content/drive/MyDrive/Mastitis_illness_cow/labels  [MISSING]


In [2]:
# ===== 2) ADAPTIVE TASK: RISK_NEXT (visits) → RISK_WITHIN (days) → fallback PROXIMITY =====
# Goal: derive a robust visit-level target ('risk_next') for early diagnosis modeling.
# Why adaptive? On small, noisy, or irregular time series, a single definition may be too sparse or too dense.
# Strategy:
#   (1) Try a visit-based lookahead: "positive if any of the next K visits is positive".
#   (2) If that’s too sparse, try a time-based window: "positive if an onset occurs within H days".
#   (3) If both fail thresholds, fall back to a proximity proxy (mark last K visits before first onset, including onset).
# We also enforce leak-safe splits by cow so the same animal never leaks across train/val/test.

import os, re, glob
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

# --- Normalise cow IDs to a canonical form ('cow{N}') ------------------------
# Rationale: imaging/fusion later expect consistent identifiers. This absorbs 'Cow_01', 'COW-003', 'cow12' → 'cow1'/'cow3'/'cow12'.
def normalize_cow_id(x) -> str:
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return ""
    s = str(x).strip().lower()
    m = re.search(r"cow[_\-]?(\d+)", s)
    if m:
        return f"cow{int(m.group(1))}"
    # If no 'cow' token present, map bare digits to cowN as a fallback
    m2 = re.search(r"\b(\d{1,6})\b", s)
    if m2:
        return f"cow{int(m2.group(1))}"
    return s

# --- Ensure binary label semantics (0/1) -------------------------------------
# Rationale: table labels may be {0,1}, {True,False}, or strings like 'yes'/'no'.
def ensure_binary(arr, pos_values=(1, "1", True, "true", "yes", "y")):
    y = pd.to_numeric(arr, errors="coerce")
    if y.isna().all():
        # Map strings to binary if numerics fail entirely
        y = arr.astype(str).str.lower().isin({str(v).lower() for v in pos_values}).astype(int)
    else:
        y = (y.fillna(0).astype(float) > 0).astype(int)
    return y

# --- Task aggressiveness thresholds ------------------------------------------
# Why: We prefer a target with enough positives to be learnable but not trivial.
K_LIST   = [3, 5, 7, 10, 14, 21, 30]   # lookahead in number of future visits
H_LIST   = [3, 5, 7, 10, 14, 21, 30]   # lookahead horizon in days
MIN_POS_VISITS = 50                    # minimum positive visits required
MIN_POS_COWS   = 30                    # minimum cows with ≥1 positive visit

# --- Load the tabular dataset as configured in Cell 1 ------------------------
assert 'TABULAR_CSV_PATH' in globals(), "TABULAR_CSV_PATH is not defined (check Cell 1)."
TABULAR_CSV_PATH = Path(TABULAR_CSV_PATH)
if not TABULAR_CSV_PATH.exists():
    raise FileNotFoundError(f"Tabular CSV not found at: {TABULAR_CSV_PATH}")

tab = pd.read_csv(TABULAR_CSV_PATH)
print("[Tabular] shape:", tab.shape)
print("[Tabular] Columns:", list(tab.columns))

# --- Auto-detect key columns (ID, time, label) --------------------------------
# Why: supports light schema drift while keeping the notebook reusable.
COW_ID_COL = next((c for c in ["Cow_ID","cow_id","CowID","animal_id","Animal_ID","subject_id","id","ID"] if c in tab.columns), None)
TIME_COL   = next((c for c in ["Day","day","time","Time","Days","days"] if c in tab.columns), None)
RAW_TARGET = next((c for c in ["class1","Class1","Label","label","mastitis","status","target","class","disease","outcome","y"] if c in tab.columns), None)
if COW_ID_COL is None: raise KeyError("Cow ID column not found. Expected one of: Cow_ID, cow_id, CowID, animal_id, ...")
if TIME_COL   is None: raise KeyError("Time column not found (e.g., 'Day').")
if RAW_TARGET is None: raise KeyError("Binary target column not found (e.g., 'class1').")

# --- Normalise identifiers and coerce types ----------------------------------
tab = tab.copy()
tab["Cow_ID_match"] = tab[COW_ID_COL].apply(normalize_cow_id)  # canonical cow id
KEY = "Cow_ID_match"

tab[TIME_COL]   = pd.to_numeric(tab[TIME_COL], errors="coerce")
tab[RAW_TARGET] = ensure_binary(tab[RAW_TARGET])
tab = tab[tab[KEY] != ""].copy().sort_values([KEY, TIME_COL]).reset_index(drop=True)

print("[CowID] unique:", tab[KEY].nunique())
print("[Target] positives:", int(tab[RAW_TARGET].sum()), "| negatives:", int((1 - tab[RAW_TARGET]).sum()))

# --- Label builders: three options with complementary assumptions ------------
def build_risk_nextK_visits(df, K=3, key=KEY, tcol=TIME_COL, ycol=RAW_TARGET):
    """
    Visit-based risk: mark a visit as 1 if any of the next K visits is positive.
    We exclude visits that are already positive at time t (set to -1 and later filtered out).
    Why: aligns with clinical “what happens next” framing for discrete schedules.
    """
    out = []
    for cow, g in df.groupby(key, sort=False):
        g = g.sort_values(tcol, na_position="last").reset_index(drop=True)
        y = g[ycol].to_numpy(dtype=int)
        n = len(g)
        rn = np.full(n, -1, dtype=int)
        for t in range(n):
            if y[t] == 1:
                rn[t] = -1          # exclude current-onset visits from inputs
            else:
                t2 = min(n-1, t + K)
                rn[t] = int((y[t+1:t2+1] == 1).any()) if t < n-1 else 0
        g2 = g.copy(); g2["risk_next"] = rn
        out.append(g2)
    out = pd.concat(out, ignore_index=True)
    out = out.query("risk_next != -1").copy()
    out["risk_next"] = out["risk_next"].astype(int)
    return out

def build_risk_withinH_days(df, H=7, key=KEY, tcol=TIME_COL, ycol=RAW_TARGET):
    """
    Time-based risk: mark a visit as 1 if a positive event occurs within H days after this visit.
    We exclude visits already positive at time t.
    Why: respects irregular sampling in days rather than visit counts.
    """
    out = []
    for cow, g in df.groupby(key, sort=False):
        g = g.sort_values(tcol, na_position="last").reset_index(drop=True)
        y = g[ycol].to_numpy(dtype=int)
        tvals = g[tcol].to_numpy(dtype=float)
        n = len(g)
        rn = np.zeros(n, dtype=int)
        for t in range(n):
            if y[t] == 1:
                rn[t] = -1
            else:
                future_idx = np.where((tvals > tvals[t]) & (tvals - tvals[t] <= H))[0]
                rn[t] = int(any(y[j] == 1 for j in future_idx))
        g2 = g.copy(); g2["risk_next"] = rn
        out.append(g2)
    out = pd.concat(out, ignore_index=True)
    out = out.query("risk_next != -1").copy()
    out["risk_next"] = out["risk_next"].astype(int)
    return out

def build_proximity_visit_level(df, K=3, key=KEY, tcol=TIME_COL, ycol=RAW_TARGET):
    """
    Proximity proxy: mark as 1 the last K visits before the first onset, including the onset itself.
    Why: if genuine forecasting windows are too sparse, this yields a learnable pre-onset signal.
    """
    out = []
    for cow, g in df.groupby(key, sort=False):
        g = g.sort_values(tcol, na_position="last").reset_index(drop=True)
        y = g[ycol].to_numpy(dtype=int)
        n = len(g)
        rn = np.zeros(n, dtype=int)
        pos = np.where(y == 1)[0]
        if len(pos) > 0:
            i0 = int(pos[0])                # first onset index
            j0 = max(0, i0 - (K - 1))       # include K-1 visits before onset
            rn[j0:i0+1] = 1
        g2 = g.copy(); g2["risk_next"] = rn
        out.append(g2)
    out = pd.concat(out, ignore_index=True)
    out["risk_next"] = out["risk_next"].astype(int)
    return out

# --- Adaptive selection: try visit-based, then time-based, else proximity ----
chosen = None

# (1) Next-K visits
for K in K_LIST:
    cand = build_risk_nextK_visits(tab, K=K, key=KEY, tcol=TIME_COL, ycol=RAW_TARGET)
    pos_v = int((cand["risk_next"] == 1).sum())
    pos_c = int(cand.groupby(KEY)["risk_next"].max().sum())
    print(f"[TRY] RISK_NEXT@{K}visits | positive visits={pos_v} | positive cows={pos_c}")
    if pos_v >= MIN_POS_VISITS and pos_c >= MIN_POS_COWS:
        chosen = ("RISK_NEXT_visits", K, cand); break

# (2) Within-H days
if chosen is None:
    for H in H_LIST:
        cand = build_risk_withinH_days(tab, H=H, key=KEY, tcol=TIME_COL, ycol=RAW_TARGET)
        pos_v = int((cand["risk_next"] == 1).sum())
        pos_c = int(cand.groupby(KEY)["risk_next"].max().sum())
        print(f"[TRY] RISK_WITHIN@{H}days | positive visits={pos_v} | positive cows={pos_c}")
        if pos_v >= MIN_POS_VISITS and pos_c >= MIN_POS_COWS:
            chosen = ("RISK_WITHIN_days", H, cand); break

# (3) Fallback: proximity around onset
if chosen is None:
    K_fallback = 3
    cand = build_proximity_visit_level(tab, K=K_fallback, key=KEY, tcol=TIME_COL, ycol=RAW_TARGET)
    pos_v = int((cand["risk_next"] == 1).sum())
    pos_c = int(cand.groupby(KEY)["risk_next"].max().sum())
    print(f"[FALLBACK] PROXIMITY@{K_fallback}vis | positive visits={pos_v} | positive cows={pos_c}")
    chosen = ("PROXIMITY_visits", K_fallback, cand)

TASK_MODE, HYPER, df_risk = chosen
print(f"[CHOSEN] {TASK_MODE} param={HYPER} | visits pos={int((df_risk['risk_next']==1).sum())} "
      f"| cows pos={int(df_risk.groupby(KEY)['risk_next'].max().sum())} | N={len(df_risk)}")

# --- Leak-safe split by cow: no animal appears in multiple splits ------------
# Why: prevents identity leakage; ensures generalisation across animals rather than across visits from the same cow.
cow_any = df_risk.groupby(KEY)["risk_next"].max().astype(int)
all_cows = np.array(sorted(cow_any.index.astype(str)))
y_cows   = cow_any.reindex(all_cows).values

if len(np.unique(y_cows)) < 2:
    # If stratification is impossible (single class across cows), use plain split.
    print("[WARN] Single-class across cows. Falling back to non-stratified split.")
    tr_c, te_c = train_test_split(all_cows, test_size=0.20, random_state=42)
else:
    tr_c, te_c = train_test_split(all_cows, test_size=0.20, stratify=y_cows, random_state=42)

mask_tv = np.isin(all_cows, tr_c)
tv_cows = all_cows[mask_tv]
tv_y    = cow_any.reindex(tv_cows).values

if len(np.unique(tv_y)) < 2:
    tr_cows, val_cows = train_test_split(tv_cows, test_size=0.25, random_state=42)
else:
    tr_cows, val_cows = train_test_split(tv_cows, test_size=0.25, stratify=tv_y, random_state=42)

print(f"[Split] Train cows: {len(tr_cows)} | Val cows: {len(val_cows)} | Test cows: {len(te_c)}")
print(f"[READY] TASK_MODE='{TASK_MODE}' | label='risk_next' | hyper={HYPER}")


[Tabular] shape: (6600, 18)
[Tabular] Columns: ['Cow_ID', 'Day', 'Breed', 'Months after giving birth', 'Previous_Mastits_status', 'IUFL', 'EUFL', 'IUFR', 'EUFR', 'IURL', 'EURL', 'IURR', 'EURR', 'Temperature', 'Hardness', 'Pain', 'Milk_visibility', 'class1']
[CowID] unique: 1100
[Target] positives: 1110 | negatives: 5490
[TRY] RISK_NEXT@3visits | positive visits=0 | positive cows=0
[TRY] RISK_NEXT@5visits | positive visits=0 | positive cows=0
[TRY] RISK_NEXT@7visits | positive visits=0 | positive cows=0
[TRY] RISK_NEXT@10visits | positive visits=0 | positive cows=0
[TRY] RISK_NEXT@14visits | positive visits=0 | positive cows=0
[TRY] RISK_NEXT@21visits | positive visits=0 | positive cows=0
[TRY] RISK_NEXT@30visits | positive visits=0 | positive cows=0
[TRY] RISK_WITHIN@3days | positive visits=0 | positive cows=0
[TRY] RISK_WITHIN@5days | positive visits=0 | positive cows=0
[TRY] RISK_WITHIN@7days | positive visits=0 | positive cows=0
[TRY] RISK_WITHIN@10days | positive visits=0 | positiv

In [3]:
# ===== 2.5) Visit-level feature engineering (leak-safe) & cow-aligned splits =====
# Goal: build leakage-safe visit-level features with cows strictly disjoint across splits.
# Why: ensures fair generalisation (no identity leakage) and stable training.
# Design notes:
#   • We derive features *within each split* only (no cross-split stats).
#   • We keep a single, clean key (KEY) and a single, integer target (YCOL).
#   • We handle both "Day" and custom time columns via TIME_COL from Cell 2.
#   • We drop degenerate (all-NaN / zero-variance) features based on TRAIN only.
#   • We optionally keep only the K most recent visits per cow to focus on near-term risk.

import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split

# ---------- Preconditions & context ------------------------------------------
assert 'tab' in globals(), "Run Cell 2 first: the DataFrame 'tab' is missing."
assert 'TIME_COL' in globals() and isinstance(TIME_COL, str), "TIME_COL must be defined in Cell 2."
KEY = "Cow_ID_match"

# Target detection with graceful fallback
YCOL_CANDIDATES = ["risk_next", "early", "class1", "Label", "label"]
YCOL = next((c for c in YCOL_CANDIDATES if c in tab.columns), None)
if YCOL is None:
    raise KeyError(f"No target column found in 'tab'. Expected one of: {YCOL_CANDIDATES}")
if KEY not in tab.columns:
    raise KeyError(f"Key '{KEY}' missing in 'tab'; please verify Cell 2.")

# ---------- Robust utilities: ensure a single key/target ---------------------
def coerce_and_dedup_target(df: pd.DataFrame, ycol: str) -> pd.DataFrame:
    """
    Ensure a single integer target column ycol exists in df.
    Rationale: merged frames or accidental duplicates can create multiple same-named columns.
    Strategy:
      - If missing → create zeros.
      - If one    → numeric cast + fillna(0) + int.
      - If many   → numeric-cast then row-wise max (OR-like).
    """
    cols = [c for c in df.columns if c == ycol]
    if len(cols) == 0:
        df[ycol] = 0
        return df
    if len(cols) == 1:
        df[ycol] = pd.to_numeric(df[ycol], errors="coerce").fillna(0).astype(int)
        return df
    comb = df[cols].apply(pd.to_numeric, errors="coerce").fillna(0).max(axis=1).astype(int)
    df = df.drop(columns=cols, errors="ignore")
    df[ycol] = comb
    return df

def coerce_and_dedup_key(df: pd.DataFrame, key: str) -> pd.DataFrame:
    """
    Ensure a single clean key column exists in df.
    Why: downstream joins/splits assume a unique cow ID; duplicates can appear after merges.
    Strategy:
      - If one column → cast to str.
      - If multiple   → take first non-NaN per row (backfill across duplicates) and keep that.
    """
    cols = [c for c in df.columns if c == key]
    if len(cols) == 0:
        raise KeyError(f"Key '{key}' absent after preprocessing.")
    if len(cols) == 1:
        df[key] = df[key].astype(str)
        return df
    tmp = (df[cols].astype(str).replace({"nan": np.nan, "None": np.nan}))
    comb = tmp.bfill(axis=1).iloc[:, 0].astype(str)
    df = df.drop(columns=cols, errors="ignore")
    df[key] = comb
    return df

# Apply initial de-duplication on 'tab'
tab = coerce_and_dedup_key(tab, KEY)
tab = coerce_and_dedup_target(tab, YCOL)

# ---------- Ensure we have cow-based splits; rebuild if missing --------------
# Why: keep cows disjoint across train/val/test; this avoids identity leakage.
if not all(k in globals() for k in ["tr_cows", "val_cows", "test_cows"]):
    print("[2.5 Fallback] Rebuilding cow-based splits…")
    cow_y = tab.groupby(KEY)[YCOL].max().astype(int)
    all_cows = np.array(sorted(cow_y.index.astype(str)))
    if cow_y.nunique() < 2:
        # No stratification possible; still enforce disjointness.
        tr_all, te_all = train_test_split(all_cows, test_size=0.20, random_state=42, shuffle=True)
    else:
        tr_all, te_all = train_test_split(
            all_cows, test_size=0.20,
            stratify=cow_y.reindex(all_cows).values, random_state=42
        )
    tv_labels = cow_y.reindex(tr_all).values
    if len(np.unique(tv_labels)) < 2:
        tr_cows, val_cows = train_test_split(tr_all, test_size=0.25, random_state=42, shuffle=True)
    else:
        tr_cows, val_cows = train_test_split(
            tr_all, test_size=0.25, stratify=tv_labels, random_state=42
        )
    test_cows = te_all
    print(f"[2.5 Fallback] Train cows: {len(tr_cows)} | Val cows: {len(val_cows)} | Test cows: {len(test_cows)}")

# ---------- Visit-ordered base copy ------------------------------------------
# We preserve visit order per cow using TIME_COL when present; otherwise we synthesise a visit index.
base = tab.copy()
if TIME_COL in base.columns:
    base = base.sort_values([KEY, TIME_COL]).reset_index(drop=True)
else:
    base = (
        base.sort_values([KEY])
            .assign(_visit_idx = base.groupby(KEY).cumcount())
            .sort_values([KEY, "_visit_idx"])
            .reset_index(drop=True)
    )

# ---------- Select numeric columns for feature engineering -------------------
# Exclusions: key, target, and known non-numeric metadata. Extend as needed for your schema.
exclude_cols = {KEY, YCOL, "Cow_ID_norm", "onset_day", "Breed", "Previous_Mastits_status"}
num_cols_all = (
    base.drop(columns=[c for c in exclude_cols if c in base.columns], errors="ignore")
        .select_dtypes(include=[np.number])
        .columns.tolist()
)
if len(num_cols_all) == 0:
    raise RuntimeError("No numeric columns available for feature engineering. Verify input schema and exclusions.")

# ---------- Helper utilities -------------------------------------------------
def split_by_cows(df, cows):
    """Return only rows whose cow id is in the provided list. Keeps splits leak-safe by construction."""
    return df[df[KEY].astype(str).isin(set(map(str, cows)))].reset_index(drop=True)

def add_time_features(df: pd.DataFrame, num_cols) -> pd.DataFrame:
    """
    Build per-cow temporal features (leak-safe) computed *within each split*:
      • lag1 for each numeric variable (captures short-term trend),
      • rolling means over 3 and 5 visits (local smoothing),
      • first differences on raw and rolling means (change signals),
      • per-cow expanding z-score (normalises by each cow’s evolving baseline).
    Missing values from shifts/rolling are set to 0 for robustness.
    """
    d = df.copy()
    d = coerce_and_dedup_key(d, KEY)

    # Visit order: prefer TIME_COL if present, else synthetic index.
    if TIME_COL in d.columns:
        d = d.sort_values([KEY, TIME_COL]).reset_index(drop=True)
    else:
        if "_visit_idx" not in d.columns:
            d["_visit_idx"] = d.groupby(KEY).cumcount()
        d = d.sort_values([KEY, "_visit_idx"]).reset_index(drop=True)

    for c in num_cols:
        grp = d.groupby(KEY)[c]

        # lag-1
        d[f"{c}_lag1"] = grp.shift(1)

        # rolling means (3, 5)
        r3 = grp.rolling(3, min_periods=1).mean().reset_index(level=0, drop=True)
        r5 = grp.rolling(5, min_periods=1).mean().reset_index(level=0, drop=True)
        d[f"{c}_r3_mean"] = r3
        d[f"{c}_r5_mean"] = r5

        # first differences (raw + rolling)
        d[f"{c}_d1"]    = grp.diff(1)
        d[f"{c}_r3_d1"] = d[f"{c}_r3_mean"].groupby(d[KEY]).diff(1)
        d[f"{c}_r5_d1"] = d[f"{c}_r5_mean"].groupby(d[KEY]).diff(1)

        # expanding z-score per cow (adaptive baseline over time)
        exp_mean = grp.expanding().mean().reset_index(level=0, drop=True)
        exp_std  = grp.expanding().std().reset_index(level=0, drop=True).replace(0, np.nan)
        z = (d[c] - exp_mean) / exp_std
        d[f"{c}_z_cow"] = z.replace([np.inf, -np.inf], np.nan)

    # New feature columns (not in original df)
    fe_cols = [c for c in d.columns if c not in df.columns]
    d[fe_cols] = d[fe_cols].fillna(0)

    # Final hygiene: ensure a single clean key.
    d = coerce_and_dedup_key(d, KEY)
    return d

def take_last_k(df, k=6):
    """
    Keep only the last k visits per cow (focus on recent history).
    If TIME_COL is absent, use a synthetic visit index to define recency.
    """
    d = coerce_and_dedup_key(df.copy(), KEY)
    if TIME_COL in d.columns:
        d["_rank_last"] = d.groupby(KEY)[TIME_COL].rank(method="first", ascending=False)
    else:
        if "_visit_idx" not in d.columns:
            d["_visit_idx"] = d.groupby(KEY).cumcount()
        d["_rank_last"] = d.groupby(KEY)["_visit_idx"].rank(method="first", ascending=False)
    out = d[d["_rank_last"] <= k].drop(columns=["_rank_last"])
    return coerce_and_dedup_key(out.reset_index(drop=True), KEY)

def drop_degenerate(train_df, val_df, test_df, key, ycol):
    """
    Drop degenerate features using TRAIN statistics only:
      • columns all-NaN on TRAIN,
      • zero-variance columns on TRAIN.
    Then align VAL/TEST to the kept set. Preserve key/target/time/index helpers.
    """
    # Ensure exactly one key/target column each.
    train_df = coerce_and_dedup_key(coerce_and_dedup_target(train_df, ycol), key)
    val_df   = coerce_and_dedup_key(coerce_and_dedup_target(val_df,   ycol), key)
    test_df  = coerce_and_dedup_key(coerce_and_dedup_target(test_df,  ycol), key)

    keep = []
    for c in train_df.columns:
        if c in {key, ycol, TIME_COL, "_visit_idx", "Cow_ID_norm"}:
            keep.append(c); continue
        if str(train_df[c].dtype).startswith(("float","int")):
            col = train_df[c]
            if col.isna().all():
                continue
            if col.nunique(dropna=True) <= 1:
                continue
            keep.append(c)
    tr2 = train_df[[k for k in keep if k in train_df.columns] + [key, ycol]].copy()
    va2 = val_df[[k for k in keep if k in val_df.columns] + [key, ycol]].copy()
    te2 = test_df[[k for k in keep if k in test_df.columns] + [key, ycol]].copy()

    # Final hygiene
    tr2 = coerce_and_dedup_key(coerce_and_dedup_target(tr2, ycol), key)
    va2 = coerce_and_dedup_key(coerce_and_dedup_target(va2, ycol), key)
    te2 = coerce_and_dedup_key(coerce_and_dedup_target(te2, ycol), key)

    dropped = sorted(list(set(train_df.columns) - set(tr2.columns)))
    return tr2, va2, te2, dropped

def safe_target_series(df: pd.DataFrame, ycol: str) -> pd.Series:
    """Return a single integer Series for the target, consolidating multi-columns if needed."""
    obj = df[ycol]
    if isinstance(obj, pd.DataFrame):
        y = obj.apply(pd.to_numeric, errors="coerce").fillna(0).max(axis=1)
    else:
        y = pd.to_numeric(obj, errors="coerce").fillna(0)
    return y.astype(int)

def count_pos_visits(df, ycol):
    """Number of visits labelled positive (1)."""
    y = safe_target_series(df, ycol)
    return int((y == 1).sum())

def count_pos_cows(df, key, ycol):
    """Number of cows with at least one positive visit (max over visits per cow)."""
    df = coerce_and_dedup_key(df.copy(), key)
    y = safe_target_series(df, ycol)
    per_cow = df.assign(__y=y).groupby(key)["__y"].max()
    return int(per_cow.sum())

# ---------- Split-specific (no-leak) feature engineering ---------------------
# Why: compute temporal stats (lags/rolling/expanding) separately per split to avoid leakage.
train_raw = split_by_cows(base, tr_cows)
val_raw   = split_by_cows(base, val_cows)
test_raw  = split_by_cows(base, test_cows)

# De-duplicate key/target BEFORE feature engineering (stability).
train_raw = coerce_and_dedup_key(coerce_and_dedup_target(train_raw, YCOL), KEY)
val_raw   = coerce_and_dedup_key(coerce_and_dedup_target(val_raw,   YCOL), KEY)
test_raw  = coerce_and_dedup_key(coerce_and_dedup_target(test_raw,  YCOL), KEY)

train_fe = add_time_features(train_raw, num_cols_all)
val_fe   = add_time_features(val_raw,   num_cols_all)
test_fe  = add_time_features(test_raw,  num_cols_all)

# ---------- Optional: keep only the last K visits per cow --------------------
V_LAST = 6  # tune as needed
train_sel = take_last_k(train_fe, V_LAST)
val_sel   = take_last_k(val_fe,   V_LAST)
test_sel  = take_last_k(test_fe,  V_LAST)

# De-duplicate key/target AGAIN (post FE/filters)
train_sel = coerce_and_dedup_key(coerce_and_dedup_target(train_sel, YCOL), KEY)
val_sel   = coerce_and_dedup_key(coerce_and_dedup_target(val_sel,   YCOL), KEY)
test_sel  = coerce_and_dedup_key(coerce_and_dedup_target(test_sel,  YCOL), KEY)

# ---------- Drop degenerate features (train-driven) --------------------------
train_df, val_df, test_df, dropped_cols = drop_degenerate(train_sel, val_sel, test_sel, KEY, YCOL)

# ---------- Diagnostics -------------------------------------------------------
print(f"[FE-visit] rows — TRAIN {train_df.shape} | VAL {val_df.shape} | TEST {test_df.shape}")
print(f"[FE-visit] visits+ ({YCOL}) — TR {count_pos_visits(train_df,YCOL)} | VA {count_pos_visits(val_df,YCOL)} | TE {count_pos_visits(test_df,YCOL)}")
print(f"[FE-visit] cows+ (max-per-cow) — TR {count_pos_cows(train_df,KEY,YCOL)} | VA {count_pos_cows(val_df,KEY,YCOL)} | TE {count_pos_cows(test_df,KEY,YCOL)}")
feat_cnt = len([c for c in train_df.columns if c not in {KEY, YCOL, TIME_COL, '_visit_idx', 'Cow_ID_norm'}])
print(f"[READY] KEY='{KEY}' | YCOL='{YCOL}' | TIME_COL='{TIME_COL}' | Num features={feat_cnt}")
if dropped_cols:
    print(f"[NOTE] Dropped degenerate columns: {dropped_cols[:10]}{' ...' if len(dropped_cols)>10 else ''}")


[2.5 Fallback] Rebuilding cow-based splits…
[2.5 Fallback] Train cows: 660 | Val cows: 220 | Test cows: 220
[FE-visit] rows — TRAIN (3960, 99) | VAL (1320, 99) | TEST (1320, 99)
[FE-visit] visits+ (class1) — TR 666 | VA 222 | TE 222
[FE-visit] cows+ (max-per-cow) — TR 111 | VA 37 | TE 37
[READY] KEY='Cow_ID_match' | YCOL='class1' | TIME_COL='Day' | Num features=96
[NOTE] Dropped degenerate columns: ['Breed', 'Cow_ID', 'Hardness_d1', 'Hardness_r3_d1', 'Hardness_r5_d1', 'Hardness_z_cow', 'Milk_visibility_d1', 'Milk_visibility_r3_d1', 'Milk_visibility_r5_d1', 'Milk_visibility_z_cow'] ...


In [4]:
# ===== 3) Image Index Builder (COCO-based) — robust cow-id alignment =====
# Goal: build `df_images` from a COCO JSON, resolve absolute file paths under IMAGE_DIR,
# and align each image to a cow-level label coming from the VISIT-LEVEL TARGET (df_risk from Cell 2).
# Why this way:
#   • Old YOLO TXT labels are not trusted (hashed/unusable).
#   • COCO defines canonical `images`/`annotations`/`categories` for indexing.
#   • Supervision should follow the ADAPTIVE TARGET built in Cell 2 (df_risk → risk_next),
#     aggregated to cow-level with a max-over-visits per cow.
#   • If cow-id cannot be inferred from path, we can optionally use an explicit CSV map: `file_name,Cow_ID`.

import json, re, os, pandas as pd, numpy as np
from pathlib import Path

# --- Preconditions from Cells 1–2 --------------------------------------------
assert 'IMAGE_DIR' in globals() and isinstance(IMAGE_DIR, Path), "IMAGE_DIR must come from Cell 1."
assert 'COCO_JSON_PATH' in globals() and isinstance(COCO_JSON_PATH, Path), "COCO_JSON_PATH must come from Cell 1."
assert 'df_risk' in globals(), "Run Cell 2 first: 'df_risk' (visit-level target) is required."
assert 'KEY' in globals(), "Run Cell 2 first: KEY (cow id col in df_risk) must be defined."
assert COCO_JSON_PATH.exists(), f"COCO JSON not found: {COCO_JSON_PATH}"
assert IMAGE_DIR.exists(), f"IMAGE_DIR not found: {IMAGE_DIR}"

# --- Config knobs -------------------------------------------------------------
# Use mapping only if it's a real file (not empty, not a directory)
USE_MAPPING_CSV = (
    'MAPPING_CSV_PATH' in globals()
    and MAPPING_CSV_PATH
    and isinstance(MAPPING_CSV_PATH, Path)
    and MAPPING_CSV_PATH.is_file()
)

# Optional: imaging-only labels from COCO categories (NOT used for fusion).
USE_COCO_AS_IMAGING_LABELS = False
POSITIVE_CATEGORIES = {"mastitis", "lesion", "inflammation"}  # adapt to your COCO taxonomy

print("[Config] IMAGE_DIR:", IMAGE_DIR)
print("[Config] COCO_JSON_PATH:", COCO_JSON_PATH)
print("[Config] MAPPING_CSV_PATH:", (MAPPING_CSV_PATH if USE_MAPPING_CSV else "None/Not a file"))
print("[Config] USE_MAPPING_CSV:", USE_MAPPING_CSV)

# --- Helpers -----------------------------------------------------------------
def normalize_cow_id(x) -> str:
    """Map 'Cow_01'/'COW-003'/3 → 'cow1'/'cow3'. If only digits exist, still map to 'cowN'."""
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return ""
    s = str(x).strip().lower()
    m = re.search(r"cow[_\-]?(\d+)", s)
    if m:
        return f"cow{int(m.group(1))}"
    m2 = re.search(r"\b(\d{1,6})\b", s)
    if m2:
        return f"cow{int(m2.group(1))}"
    return s

def find_image_path(root: Path, fn: str) -> str:
    """
    Resolve a COCO `file_name` to an absolute path under IMAGE_DIR (robust to subfolders).
    1) root/fn if exists; 2) search by basename anywhere under root.
    """
    p = root / fn
    if p.exists():
        return str(p)
    hits = list(root.rglob(Path(fn).name))
    return str(hits[0]) if hits else ""

def extract_cow_id_from_path(p: str) -> str:
    """
    Heuristic cow-id inference from the path:
      • Look for tokens like 'cow3', 'cow_003', 'Cow-12' in any folder part.
      • Fallback: use a 1–6 digit cluster in the filename and map to 'cowN'.
    Prefer an explicit mapping CSV when folders are flat.
    """
    path = Path(p)
    for part in path.parts[::-1]:
        m = re.search(r"\b(cow[_\-]?\d{1,6})\b", part.lower())
        if m:
            return normalize_cow_id(m.group(1))
    base = path.stem
    m2 = re.search(r"(\d{1,6})", base)
    return normalize_cow_id(m2.group(1)) if m2 else ""

# --- Load COCO ---------------------------------------------------------------
with open(COCO_JSON_PATH, "r") as f:
    coco = json.load(f)

coco_images = pd.DataFrame(coco.get("images", []))
coco_cats   = pd.DataFrame(coco.get("categories", []))
coco_ann    = pd.DataFrame(coco.get("annotations", []))
print(f"[COCO] images={len(coco_images)} | annotations={len(coco_ann)} | categories={len(coco_cats)}")

# Resolve absolute paths + basic fields
coco_images = coco_images.copy()
coco_images["path"] = coco_images["file_name"].apply(lambda fn: find_image_path(IMAGE_DIR, fn))
coco_images["path_exists"] = coco_images["path"].apply(lambda p: bool(p) and Path(p).exists())
coco_images["stem"] = coco_images["file_name"].apply(lambda fn: Path(fn).stem)

missing = coco_images[~coco_images["path_exists"]]
if len(missing) > 0:
    print(f"[WARN] {len(missing)} images from COCO were not found under IMAGE_DIR. Showing first 5:")
    display(missing.head(5)[["id","file_name","path"]])

# --- Optional explicit mapping: filename -> Cow_ID ---------------------------
if USE_MAPPING_CSV:
    print(f"[Mapping] Using mapping CSV: {MAPPING_CSV_PATH}")
    mp = pd.read_csv(MAPPING_CSV_PATH)
    assert {"file_name","Cow_ID"}.issubset(mp.columns), \
        "Mapping CSV must have columns: file_name,Cow_ID"
    mp["Cow_ID"] = mp["Cow_ID"].apply(normalize_cow_id)
    coco_images = coco_images.merge(mp[["file_name","Cow_ID"]], on="file_name", how="left")
    coco_images.rename(columns={"Cow_ID": "_cid_"}, inplace=True)
else:
    print("[Mapping] No valid mapping CSV file detected (skipping).")
    coco_images["_cid_"] = coco_images["path"].apply(extract_cow_id_from_path)

# --- Cow-level label from df_risk (Cell 2) -----------------------------------
# Use the ADAPTIVE VISIT-LEVEL TARGET 'risk_next' aggregated per cow to get a stable y_img.
df_risk_norm = df_risk.copy()
df_risk_norm["_cid_"] = df_risk_norm[KEY].apply(normalize_cow_id)
y_per_cow = (
    df_risk_norm.groupby("_cid_")["risk_next"]
    .apply(lambda s: int(pd.to_numeric(s, errors="coerce").fillna(0).max()))
    .rename("y_img")
    .reset_index()
)

df_images = (
    coco_images[["id","file_name","path","path_exists","stem","_cid_"]]
    .merge(y_per_cow, on="_cid_", how="left")
)

# --- Optional: imaging-only labels from COCO categories ----------------------
if USE_COCO_AS_IMAGING_LABELS and not coco_cats.empty and not coco_ann.empty:
    cat_map = {int(row["id"]): str(row.get("name","")).lower() for _, row in coco_cats.iterrows()}
    ann = coco_ann.copy()
    ann["cat_name"] = ann["category_id"].map(cat_map)
    ann["is_pos"] = ann["cat_name"].isin({c.lower() for c in POSITIVE_CATEGORIES})
    pos_by_img = ann.groupby("image_id")["is_pos"].max().reset_index()
    pos_by_img.rename(columns={"is_pos":"y_img_coco"}, inplace=True)
    df_images = df_images.merge(pos_by_img, left_on="id", right_on="image_id", how="left")
    df_images.drop(columns=["image_id"], inplace=True)
    df_images["y_img_coco"] = df_images["y_img_coco"].fillna(False).astype(int)
else:
    df_images["y_img_coco"] = np.nan  # not used

# --- Final hygiene & diagnostics ---------------------------------------------
df_images["y_img_present"] = df_images["y_img"].notna().astype(int)
df_images = df_images[df_images["path_exists"]].reset_index(drop=True)

print(f"[Images] df_images shape: {df_images.shape}")
print(f"[Images] with cow-id (_cid_) present: {df_images['_cid_'].notna().sum()} / {len(df_images)}")
print(f"[Images] with cow-level label (y_img): {int(df_images['y_img_present'].sum())} / {len(df_images)}")

dbg_cols = ["file_name","path","_cid_","y_img","y_img_coco"]
print("\n[Debug] Example rows with cow-level label:")
display(df_images[df_images["y_img_present"] == 1][dbg_cols].head(10))
print("\n[Debug] Example rows missing cow-level label:")
display(df_images[df_images["y_img_present"] == 0][dbg_cols].head(10))

# Overlap with df_risk cows (sanity)
img_cows = set(df_images["_cid_"].dropna().astype(str).unique())
risk_cows = set(df_risk_norm["_cid_"].dropna().astype(str).unique())
print(f"[Overlap] cows in images: {len(img_cows)} | in df_risk: {len(risk_cows)} | intersection: {len(img_cows & risk_cows)}")

# Guidance:
#   • If many images lack y_img: check KEY/normalisation consistency and whether those cows exist in df_risk.
#   • If folders are flat (no 'cowX'), provide MAPPING_CSV_PATH (filename→Cow_ID) to avoid heuristic errors.
#   • For imaging-only experiments without cow labels, set USE_COCO_AS_IMAGING_LABELS=True.


[Config] IMAGE_DIR: /content/drive/MyDrive/Mastitis_illness_cow/datasets/images
[Config] COCO_JSON_PATH: /content/drive/MyDrive/Mastitis_illness_cow/exports/_annotations.coco.json
[Config] MAPPING_CSV_PATH: /content/drive/MyDrive/Mastitis_illness_cow/datasets/image_to_cow_map.csv
[Config] USE_MAPPING_CSV: True
[COCO] images=130 | annotations=185 | categories=6
[Mapping] Using mapping CSV: /content/drive/MyDrive/Mastitis_illness_cow/datasets/image_to_cow_map.csv
[Images] df_images shape: (130, 9)
[Images] with cow-id (_cid_) present: 130 / 130
[Images] with cow-level label (y_img): 0 / 130

[Debug] Example rows with cow-level label:


Unnamed: 0,file_name,path,_cid_,y_img,y_img_coco



[Debug] Example rows missing cow-level label:


Unnamed: 0,file_name,path,_cid_,y_img,y_img_coco
0,FLIR0179_jpg.rf.7b1370df26ea8498381f67453133af...,/content/drive/MyDrive/Mastitis_illness_cow/da...,,,
1,FLIR0227_jpg.rf.845a66986fb6d4d9648aa314ced09e...,/content/drive/MyDrive/Mastitis_illness_cow/da...,,,
2,FLIR1445_jpg.rf.045fbc0881e974ff438962ed621fac...,/content/drive/MyDrive/Mastitis_illness_cow/da...,,,
3,FLIR1867_jpg.rf.efa16aea0933b52816d3df8e3c6f03...,/content/drive/MyDrive/Mastitis_illness_cow/da...,,,
4,FLIR1695_jpg.rf.8265732ed9ecf71b800da75ac6e20d...,/content/drive/MyDrive/Mastitis_illness_cow/da...,,,
5,FLIR0843_jpg.rf.11aa52a9b9c110de0f267c09a1c2d6...,/content/drive/MyDrive/Mastitis_illness_cow/da...,,,
6,FLIR1509_jpg.rf.0c7f6501cb7be8160df19e17973aa0...,/content/drive/MyDrive/Mastitis_illness_cow/da...,,,
7,FLIR0983_jpg.rf.e5f677846f89da4b40af8dd12e19e7...,/content/drive/MyDrive/Mastitis_illness_cow/da...,,,
8,FLIR1833_jpg.rf.c071bcf03f96633dcb00121c8028e8...,/content/drive/MyDrive/Mastitis_illness_cow/da...,,,
9,FLIR1647_jpg.rf.0eede5710a1a7b5f5adff4ee6dc4d5...,/content/drive/MyDrive/Mastitis_illness_cow/da...,,,


[Overlap] cows in images: 1 | in df_risk: 1100 | intersection: 0


In [5]:
# ===== 4) Imaging model — EfficientNet frozen + Augment + TTA + cow-stratified split =====
# Purpose:
#   Train an image-only branch consistent with the clinical target (y_img) derived from df_risk.
#   We: (1) keep only images with y_img present, (2) split cows (not images) into TR/VAL/TEST,
#   (3) use a frozen EfficientNet as feature extractor, (4) add Augment, Oversampling (optional), and TTA,
#   (5) report image-level metrics and cow-level aggregated probabilities.
#
# Inputs expected from previous cells:
#   • df_images with columns: ['path','_cid_','y_img', ...]
#   • df_images['y_img'] must be 0/1 (or NaN for unknown -> dropped here).
#
# Notes:
#   • No identity leakage: cows are disjoint across splits.
#   • Late-fusion later will use the cow-level outputs built here.

import os, math, numpy as np, pandas as pd
import torch, torchvision
import torchvision.transforms as T
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from PIL import Image
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import StratifiedShuffleSplit

# ------------------- Config -------------------
SEED = 42
rng = np.random.RandomState(SEED)
torch.manual_seed(SEED)
IMG_SIZE = 224
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PIN_MEM = torch.cuda.is_available()

# Image split configuration (by cow)
IMG_TRAIN_FRAC = 0.60
IMG_VAL_FRAC   = 0.20   # remainder → TEST
K_VIEWS_TRAIN  = 5      # augmentation views per TRAIN image
USE_OVERSAMPLING = True # oversample minority class on TRAIN
TTA_N_VIEWS    = 8      # TTA views for VAL/TEST (0 disables TTA)
BATCH_TRAIN    = 32
BATCH_EVAL     = 64
NUM_WORKERS    = 2

# ------------------- Preconditions & filtering -------------------
assert 'df_images' in globals() and len(df_images) > 0, "df_images not available. Run Cell 3."
assert {'path','_cid_','y_img'}.issubset(df_images.columns), "df_images must have path, _cid_, y_img."

# Keep only supervised images (have cow label)
dfi_all = df_images.copy()
dfi_all = dfi_all[dfi_all['y_img'].notna()].reset_index(drop=True)

# Normalise cow id (safety)
import re
def normalize_cow_id(x):
    if x is None or (isinstance(x, float) and pd.isna(x)): return ""
    s = str(x).strip().lower()
    m = re.search(r"cow[_\-]?(\d+)", s)
    if m: return f"cow{int(m.group(1))}"
    m2 = re.search(r"\b(\d{1,6})\b", s)
    if m2: return f"cow{int(m2.group(1))}"
    return s

dfi_all["_cid_"] = dfi_all["_cid_"].apply(normalize_cow_id)
dfi_all["y_img"] = pd.to_numeric(dfi_all["y_img"], errors="coerce").fillna(0).astype(int)

if dfi_all.empty:
    print("[Imaging] No supervised images (y_img present). Skipping image model.")
else:
    # ------------------- Transforms -------------------
    train_tf = T.Compose([
        T.RandomResizedCrop(IMG_SIZE, scale=(0.90, 1.00), ratio=(0.98, 1.02)),
        T.RandomHorizontalFlip(p=0.5),
        T.RandomAffine(degrees=7, translate=(0.03, 0.03), scale=(0.98, 1.02)),
        T.GaussianBlur(kernel_size=3, sigma=(0.1, 0.8)),
        T.ToTensor(),
        T.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
    ])
    eval_tf = T.Compose([
        T.Resize((IMG_SIZE, IMG_SIZE)),
        T.ToTensor(),
        T.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
    ])

    # ------------------- Backbone (frozen EfficientNet) -------------------
    try:
        import timm
    except Exception as e:
        raise RuntimeError("Please install timm in this environment: pip install timm") from e

    class EffNetFeats(nn.Module):
        """Frozen EfficientNet feature extractor: outputs GAP features (no classifier head)."""
        def __init__(self, model_name="efficientnet_b0"):
            super().__init__()
            self.backbone = timm.create_model(model_name, pretrained=True, num_classes=0, global_pool="avg")
            for p in self.backbone.parameters(): p.requires_grad = False
        def forward(self, x): return self.backbone(x)

    feat_net = EffNetFeats("efficientnet_b0").to(device).eval()
    FEAT_DIM = feat_net.backbone.num_features

    # ------------------- Dataset -------------------
    class ImageDatasetK(Dataset):
        """
        Dataset that exposes K augmented views per image.
        Labels are repeated accordingly; transform controls augmentation.
        """
        def __init__(self, df_rows: pd.DataFrame, transform, k_views=1):
            self.paths  = df_rows["path"].tolist()
            self.labels = df_rows["y_img"].astype(int).tolist()
            self.tf = transform
            self.k = max(1, int(k_views))
        def __len__(self): return len(self.paths) * self.k
        def __getitem__(self, idx):
            i = idx % len(self.paths)
            im = Image.open(self.paths[i]).convert("RGB")
            return self.tf(im), self.labels[i]

    def extract_features(dloader):
        """Run frozen backbone to obtain features; return (X, y)."""
        X, y = [], []
        with torch.no_grad():
            for xb, yb in dloader:
                xb = xb.to(device)
                feats = feat_net(xb).cpu().numpy()
                X.append(feats); y.append(np.array(yb))
        X = np.vstack(X) if len(X) else np.zeros((0, FEAT_DIM))
        y = np.concatenate(y) if len(y) else np.array([])
        return X, y

    # ------------------- Cow-stratified split (image-only branch) -------------------
    cows = dfi_all.groupby("_cid_")["y_img"].max().reset_index()
    y_cow = cows["y_img"].astype(int).values
    C     = cows["_cid_"].astype(str).values

    # TRAIN vs (VAL+TEST)
    if len(np.unique(y_cow)) < 2 or len(cows) < 5:
        # Too few cows or single-class — fall back to simple split (still cow-based)
        perm = rng.permutation(len(C))
        n_tr = int(math.ceil(IMG_TRAIN_FRAC * len(C)))
        n_va = int(math.ceil(IMG_VAL_FRAC   * len(C)))
        tr_idx = perm[:n_tr]
        va_idx = perm[n_tr:n_tr+n_va]
        te_idx = perm[n_tr+n_va:]
    else:
        sss1 = StratifiedShuffleSplit(n_splits=1, test_size=(1.0-IMG_TRAIN_FRAC), random_state=SEED)
        tr_idx, tmp_idx = next(sss1.split(C, y_cow))
        C_tmp, y_tmp = C[tmp_idx], y_cow[tmp_idx]
        test_frac_rel = (1.0 - IMG_TRAIN_FRAC - IMG_VAL_FRAC) / (1.0 - IMG_TRAIN_FRAC)
        if len(np.unique(y_tmp)) < 2 or len(C_tmp) < 3:
            # small/degenerate tmp: simple split
            perm2 = rng.permutation(len(C_tmp))
            cut = int(math.ceil((1.0 - test_frac_rel) * len(C_tmp)))
            va_idx_rel, te_idx_rel = perm2[:cut], perm2[cut:]
        else:
            sss2 = StratifiedShuffleSplit(n_splits=1, test_size=test_frac_rel, random_state=SEED)
            va_idx_rel, te_idx_rel = next(sss2.split(C_tmp, y_tmp))
        # remap to original indices
        tr_idx = tr_idx
        va_idx = tmp_idx[va_idx_rel]
        te_idx = tmp_idx[te_idx_rel]

    C_tr, C_va, C_te = C[tr_idx], C[va_idx], C[te_idx]
    tr_img = dfi_all[dfi_all["_cid_"].isin(set(C_tr))].reset_index(drop=True)
    va_img = dfi_all[dfi_all["_cid_"].isin(set(C_va))].reset_index(drop=True)
    te_img = dfi_all[dfi_all["_cid_"].isin(set(C_te))].reset_index(drop=True)

    print(f"[Imaging|Split] COWS — TRAIN: {len(C_tr)} | VAL: {len(C_va)} | TEST: {len(C_te)}")
    print(f"[Imaging|Split] IMAGES — TRAIN: {len(tr_img)} | VAL: {len(va_img)} | TEST: {len(te_img)}")
    if len(np.unique(y_cow))>=2:
        print(f"[Imaging|Class balance @cow] TRAIN pos={int((cows.iloc[tr_idx]['y_img']==1).sum())}/{len(tr_idx)} "
              f"| VAL pos={int((cows.iloc[va_idx]['y_img']==1).sum())}/{len(va_idx)} "
              f"| TEST pos={int((cows.iloc[te_idx]['y_img']==1).sum())}/{len(te_idx)}")

    # ------------------- Datasets & Samplers -------------------
    train_ds = ImageDatasetK(tr_img, transform=train_tf, k_views=K_VIEWS_TRAIN)
    val_ds   = ImageDatasetK(va_img, transform=eval_tf,   k_views=1)
    test_ds  = ImageDatasetK(te_img, transform=eval_tf,   k_views=1)

    # Oversampling on TRAIN (image-level, replicated across K views)
    y_train_base = tr_img["y_img"].astype(int).values
    class_counts = np.bincount(y_train_base) if y_train_base.size else np.array([0,0])
    sampler = None
    if USE_OVERSAMPLING and class_counts.size==2 and class_counts.min()>0:
        class_weights = 1.0 / class_counts
        sample_weights = np.array([class_weights[y_train_base[i % len(y_train_base)]] for i in range(len(train_ds))])
        sampler = WeightedRandomSampler(weights=torch.from_numpy(sample_weights).float(),
                                        num_samples=len(train_ds), replacement=True)

    tr_dl = DataLoader(train_ds, batch_size=BATCH_TRAIN, shuffle=(sampler is None),
                       sampler=sampler, num_workers=NUM_WORKERS, pin_memory=PIN_MEM)
    va_dl = DataLoader(val_ds,   batch_size=BATCH_EVAL,  shuffle=False,
                       num_workers=NUM_WORKERS, pin_memory=PIN_MEM)
    te_dl = DataLoader(test_ds,  batch_size=BATCH_EVAL,  shuffle=False,
                       num_workers=NUM_WORKERS, pin_memory=PIN_MEM) if len(te_img)>0 else None

    print(f"[Imaging|Train loader] items={len(train_ds)}  (base_imgs={len(tr_img)} × K_VIEWS={K_VIEWS_TRAIN})"
          + (f"  | oversampling=ON" if sampler is not None else "  | oversampling=OFF"))

    # ------------------- Feature Extraction -------------------
    def extract_all(dl):
        X, y = extract_features(dl); return X, y
    Xtr, ytr = extract_all(tr_dl)
    Xva, yva = extract_all(va_dl)
    if te_dl is not None:
        Xte, yte = extract_all(te_dl)
    else:
        Xte, yte = np.zeros((0, Xtr.shape[1])) if Xtr.size else np.array([]), np.array([])

    # ------------------- Simple classifier on top of features -------------------
    def feature_mixup(X, y, alpha=0.4, n_new=None, rng=rng):
        """Label-preserving interpolation in embedding space to regularise the linear head."""
        if X.shape[0] < 2: return X, y
        if n_new is None: n_new = X.shape[0] // 2
        i1 = rng.randint(0, X.shape[0], n_new); i2 = rng.randint(0, X.shape[0], n_new)
        lam = rng.beta(alpha, alpha, size=n_new)[:, None]
        Xn = lam*X[i1] + (1-lam)*X[i2]
        yn = ((lam[:, 0]*y[i1] + (1-lam[:, 0])*y[i2]) >= 0.5).astype(int)
        return np.vstack([X, Xn]), np.concatenate([y, yn])

    if Xtr.shape[0] > 0 and len(np.unique(ytr)) >= 2:
        Xtr_aug, ytr_aug = feature_mixup(Xtr, ytr, alpha=0.4, n_new=Xtr.shape[0]//2)
        clf = LogisticRegression(max_iter=4000, class_weight='balanced', solver='lbfgs', n_jobs=None)
        clf.fit(Xtr_aug, ytr_aug)

        # --- Predictions with TTA ---
        def predict_with_tta(paths, clf, n_views=TTA_N_VIEWS):
            if n_views <= 0:
                ds = ImageDatasetK(pd.DataFrame({"path": paths, "y_img": [0]*len(paths)}), eval_tf, k_views=1)
                dl = DataLoader(ds, batch_size=BATCH_EVAL, shuffle=False, num_workers=NUM_WORKERS, pin_memory=PIN_MEM)
                X,_ = extract_features(dl)
                return clf.predict_proba(X)[:, 1]
            # mild jitter, averaged
            aug_eval = T.Compose([
                T.Resize((IMG_SIZE, IMG_SIZE)),
                T.RandomHorizontalFlip(p=0.5),
                T.RandomAffine(degrees=3, translate=(0.01, 0.01), scale=(0.995, 1.005)),
                T.ToTensor(),
                T.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]),
            ])
            all_probs = []
            for _ in range(n_views):
                ds = ImageDatasetK(pd.DataFrame({"path": paths, "y_img": [0]*len(paths)}), aug_eval, k_views=1)
                dl = DataLoader(ds, batch_size=BATCH_EVAL, shuffle=False, num_workers=NUM_WORKERS, pin_memory=PIN_MEM)
                X,_ = extract_features(dl)
                all_probs.append(clf.predict_proba(X)[:, 1])
            return np.mean(np.vstack(all_probs), axis=0)

        p_val_img  = predict_with_tta(va_img['path'].tolist(), clf, n_views=TTA_N_VIEWS)
        p_test_img = predict_with_tta(te_img['path'].tolist(), clf, n_views=TTA_N_VIEWS) if len(te_img)>0 else np.array([])

        # Image-level metrics
        if len(np.unique(yva)) == 2:
            print(f"[Imaging] VAL image-level — AUROC={roc_auc_score(yva, p_val_img):.4f} | "
                  f"AUPRC={average_precision_score(yva, p_val_img):.4f} | N={len(yva)}")
        if len(yte)>0 and len(np.unique(yte)) == 2:
            print(f"[Imaging] TEST image-level — AUROC={roc_auc_score(yte, p_test_img):.4f} | "
                  f"AUPRC={average_precision_score(yte, p_test_img):.4f} | N={len(yte)}")
    else:
        print("[Imaging][WARN] Not enough training images or only one class in training. Skipping classifier.")
        p_val_img = np.array([]); p_test_img = np.array([])

    # ------------------- Per-cow aggregation -------------------
    def agg_per_cow(df_rows: pd.DataFrame, probs: np.ndarray, cow_col: str, target_col="y_img") -> pd.DataFrame:
        """
        Aggregate image-level probabilities to cow-level by mean; cow label is max over images.
        Returns [cow_id, y, p_img, n] where p_img is mean per cow and n is image count.
        """
        if probs.size == 0 or len(df_rows) == 0:
            return pd.DataFrame(columns=[cow_col, "y", "p_img", "n"])
        tmp = df_rows.copy(); tmp["proba"] = probs
        return tmp.groupby(cow_col).agg(y=(target_col,"max"), p_img=("proba","mean"), n=("proba","count")).reset_index()

    val_img_cow  = agg_per_cow(va_img, p_val_img,  cow_col="_cid_", target_col="y_img")
    test_img_cow = agg_per_cow(te_img, p_test_img, cow_col="_cid_", target_col="y_img") if len(te_img)>0 else pd.DataFrame(columns=["_cid_","y","p_img","n"])

    print(f"[Imaging] Output per-cow — VAL cows: {len(val_img_cow)} | TEST cows: {len(test_img_cow)}")
    # These will be used in the fusion cell (we'll join on cow id '_cid_').


[Imaging] No supervised images (y_img present). Skipping image model.


In [6]:
# =======================
# Cell 5 — Tabular-only v3.1 (leak-safe, visit→cow pooling + calibration)
# Purpose
#   • Train a robust tabular baseline on visit-level features and aggregate to cow-level.
#   • No imaging prerequisites. If imaging exists later, fusion can reuse pva_c/pte_c and yva_cow/yte_cow.
# Guarantees
#   • No leakage: cows are disjoint across train/val/test (from Cell 2.5).
#   • Outputs aligned series: pva_c / pte_c keyed by yva_cow.index / yte_cow.index.
# =======================

import numpy as np, pandas as pd, warnings
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss

warnings.filterwarnings("ignore", category=UserWarning)

# ---------- 0) Preconditions (tabular only) ----------
req = ["train_df","val_df","test_df","KEY","YCOL"]
missing = [k for k in req if k not in globals()]
if missing:
    raise AssertionError(f"Prerequisites missing for tabular model: {missing}. "
                         f"Run Cell 2 and 2.5 first to build train_df/val_df/test_df, KEY, YCOL.")

# Short aliases
tr, va, te = train_df.copy(), val_df.copy(), test_df.copy()
KEY_ = KEY
Y_   = YCOL

# Sanity: ensure key/target present and clean types
for nm, d in [("train",tr),("val",va),("test",te)]:
    if KEY_ not in d.columns:
        raise KeyError(f"{nm}_df is missing '{KEY_}'")
    if Y_ not in d.columns:
        raise KeyError(f"{nm}_df is missing target '{Y_}'")
    d[KEY_] = d[KEY_].astype(str)
    d[Y_]   = pd.to_numeric(d[Y_], errors="coerce").fillna(0).astype(int)

# ---------- 1) Feature whitelist (strict numeric, no time/meta) ----------
# Avoid obvious leakers/meta columns; keep only numeric predictors.
EXCLUDE = {KEY_, Y_, "Cow_ID_norm", "risk_next", "risk_h1", "onset_day",
           "Day", "_visit_idx", "visit_time", "datetime", "VisitDate"}
num_cols = (
    tr.drop(columns=[c for c in EXCLUDE if c in tr.columns], errors="ignore")
      .select_dtypes(include=[np.number]).columns.tolist()
)
if not num_cols:
    raise RuntimeError("No numeric features available after exclusions. "
                       "Check Cell 2.5 feature engineering output.")

# ---------- 2) Preprocess pipeline ----------
pre = ColumnTransformer([
    ("num", Pipeline([
        ("imp", SimpleImputer(strategy="median")),
        ("sc",  StandardScaler()),
    ]), num_cols)
], remainder="drop", verbose_feature_names_out=False)

pre.fit(tr[num_cols])

def mat(df):
    X = pre.transform(df[num_cols])
    y = df[Y_].astype(int).values
    k = df[KEY_].astype(str).values
    return k, X, y

Ktr, Xtr, ytr = mat(tr)
Kva, Xva, yva = mat(va)
Kte, Xte, yte = mat(te)

# ---------- 3) Two strong visit-level models: LR(EN) + HGB ----------
pos_rate = max(1e-6, float((ytr == 1).mean()))
w_pos = 0.5/pos_rate; w_neg = 0.5/(1.0 - pos_rate)
w_tr  = np.where(ytr == 1, w_pos, w_neg)

lr = LogisticRegression(
    max_iter=4000, solver="saga", penalty="elasticnet",
    l1_ratio=0.35, C=1.0, random_state=42, n_jobs=-1
)
hgb = HistGradientBoostingClassifier(
    learning_rate=0.15, max_leaf_nodes=31, min_samples_leaf=25,
    l2_regularization=0.0, max_depth=None, random_state=42
)

# Fit
lr.fit(Xtr, ytr, sample_weight=w_tr)
hgb.fit(Xtr, ytr, sample_weight=w_tr)

def proba(clf, X):
    return clf.predict_proba(X)[:,1] if hasattr(clf, "predict_proba") else clf.decision_function(X)

pva_lr,  pte_lr  = proba(lr,  Xva), proba(lr,  Xte)
pva_hgb, pte_hgb = proba(hgb, Xva), proba(hgb, Xte)

# ---------- 4) Visit→cow pooling (pre-event masking + robust pooling) ----------
def logistic_temp(p, tau=2.3):
    p = np.clip(p, 1e-9, 1-1e-9)
    z = np.log(p/(1-p))/tau
    return 1/(1+np.exp(-z))

def pool_visits(keys, probs, tau=2.3, r=0.8, topk=3, jitter=0.006, seed=42, pre_event=True):
    """
    Convert visit-level probabilities to cow-level score.
    Steps:
      • temperature scaling in logit space,
      • small Gaussian jitter to break ties,
      • optional pre-event exclusion (drop last visit per cow),
      • robust pooling over 'excess over cow median' with power-mean and top-k mean.
    Returns a pd.Series indexed by cow key.
    """
    pt = logistic_temp(np.asarray(probs, float), tau=tau)
    if jitter and jitter > 0:
        rng = np.random.default_rng(seed)
        pt = np.clip(pt + rng.normal(0.0, jitter, size=pt.shape), 1e-9, 1-1e-9)
    dfp = pd.DataFrame({"k": keys, "pt": pt})

    if pre_event:
        last_idx = dfp.groupby("k").tail(1).index
        dfp = dfp.drop(index=last_idx)

    if len(dfp) == 0:
        return pd.Series(dtype=float)

    med = dfp.groupby("k")["pt"].transform("median")
    exc = (dfp["pt"] - med).clip(lower=0)

    def pmean(x, rr):
        v = x.values
        return (((v**rr).mean())**(1.0/rr)) if v.size > 0 else np.nan
    def topk_mean(x, kk):
        v = np.sort(x.values)
        if v.size == 0: return np.nan
        kk = min(kk, v.size)
        return float(v[-kk:].mean())

    pm = exc.groupby(dfp["k"]).apply(lambda s: pmean(s, r))
    tk = exc.groupby(dfp["k"]).apply(lambda s: topk_mean(s, topk))
    return (pm + tk) / 2.0

def ranknorm(x):
    r = np.argsort(np.argsort(x))
    return r / max(len(x)-1, 1)

# Cow-level labels (max over visits)
yva_cow = va.groupby(KEY_)[Y_].max().astype(int)
yte_cow = te.groupby(KEY_)[Y_].max().astype(int)

# Small grid search for pooling hyperparams on VAL AUPRC
taus  = [2.0, 2.3, 2.6]
rs    = [0.7, 0.8, 0.9]
topks = [2, 3, 4]
best = None
for tau in taus:
    for r in rs:
        for k in topks:
            va_lr_c  = pool_visits(Kva, pva_lr,  tau=tau, r=r, topk=k).reindex(yva_cow.index).fillna(0.0).values
            va_hgb_c = pool_visits(Kva, pva_hgb, tau=tau, r=r, topk=k).reindex(yva_cow.index).fillna(0.0).values
            va_ens   = (ranknorm(va_lr_c) + ranknorm(va_hgb_c))/2.0
            ap = average_precision_score(yva_cow.values, va_ens)
            if (best is None) or (ap > best[0]):
                best = (ap, tau, r, k, va_lr_c, va_hgb_c, va_ens)

ap_best, TAU_B, R_B, K_B, va_lr_b, va_hgb_b, va_ens_b = best

# Apply best pooling on TEST
te_lr_b  = pool_visits(Kte, pte_lr,  tau=TAU_B, r=R_B, topk=K_B).reindex(yte_cow.index).fillna(0.0).values
te_hgb_b = pool_visits(Kte, pte_hgb, tau=TAU_B, r=R_B, topk=K_B).reindex(yte_cow.index).fillna(0.0).values
te_ens_b = (ranknorm(te_lr_b) + ranknorm(te_hgb_b))/2.0

# ---------- 5) Platt calibration on VAL (cow-level ensemble) ----------
from sklearn.linear_model import LogisticRegression as LRCal
cal = LRCal(max_iter=1000, random_state=42).fit(va_ens_b.reshape(-1,1), yva_cow.values.astype(int))
pva_c = cal.predict_proba(va_ens_b.reshape(-1,1))[:,1]
pte_c = cal.predict_proba(te_ens_b.reshape(-1,1))[:,1]

# ---------- 6) Metrics ----------
def metr(name, y, p):
    p = np.clip(p, 1e-9, 1-1e-9)
    try: auc = roc_auc_score(y, p)
    except: auc = np.nan
    ap = average_precision_score(y, p)
    br = brier_score_loss(y, p)
    return dict(name=name, AUROC=float(auc) if auc==auc else np.nan, AUPRC=float(ap), Brier=float(br), N=int(len(y)))

tab_summary = pd.DataFrame([
    metr("VAL TAB only",  yva_cow.values, pva_c),
    metr("TEST TAB only", yte_cow.values, pte_c),
])
print("\n=== Tabular-only Summary (cow-level) ===")
print(tab_summary[["name","AUROC","AUPRC","Brier","N"]].to_string(index=False))

# ---------- 7) Publish aligned globals for fusion (Cell 6) ----------
# These globals are consumed by the fusion cell; they are guaranteed aligned to yva_cow/yte_cow indices.
globals()["pva_c"]   = pva_c
globals()["pte_c"]   = pte_c
globals()["yva_cow"] = yva_cow  # pd.Series indexed by cow key
globals()["yte_cow"] = yte_cow  # pd.Series indexed by cow key

print("\n[READY] Exported for fusion: pva_c / pte_c aligned to yva_cow / yte_cow.")
print(f"[Pooling*] Best on VAL — AP={ap_best:.4f} | tau={TAU_B}, r={R_B}, topK={K_B}")
print(f"[Info] Features used ({len(num_cols)}): {num_cols[:12]}{' ...' if len(num_cols)>12 else ''}")



=== Tabular-only Summary (cow-level) ===
         name    AUROC    AUPRC    Brier   N
 VAL TAB only 0.741323 0.474406 0.125696 220
TEST TAB only 0.838502 0.645940 0.116652 220

[READY] Exported for fusion: pva_c / pte_c aligned to yva_cow / yte_cow.
[Pooling*] Best on VAL — AP=0.1116 | tau=2.6, r=0.9, topK=2
[Info] Features used (96): ['Months after giving birth', 'Previous_Mastits_status', 'IUFL', 'EUFL', 'IUFR', 'EUFR', 'IURL', 'EURL', 'IURR', 'EURR', 'Temperature', 'Hardness'] ...


In [7]:
# =======================
# Cell 6 — Hybrid imaging pipeline (COCO + YOLO labels) with auto-fallbacks
# What this does:
# - Reads labels from COCO JSON and (optionally) YOLO .txt.
# - Picks the label source with highest coverage.
# - Tries true cow-aligned splits for multimodal fusion; if coverage is too small or train is single-class,
#   it auto-falls back to image-level stratified splits (the recipe that previously performed well).
# - Uses ResNet18 embeddings + Logistic Regression, with calibration and fusion when tabular cow scores exist.
### THIS IS OUR ORIGINAL SINGLE SMALL TEST. WE REPLACE IT WITH A SUBJECT-GROUPED REPEATED CV TECHNIQUE RESTRICTED TO THE COWS THAT HAVE TERMAL IMAGES TO DRAW STASTICALLY ROBUST CONCLUSION AFTER WISE REVIEW FROM PEERING.
# =======================
import os, re, json, glob, time, warnings, random, sys
warnings.filterwarnings("ignore", category=UserWarning)
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# Torch / Vision
import torch, torchvision
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# ML
from sklearn.linear_model import LogisticRegression, LogisticRegression as LRCal
from sklearn.metrics import (average_precision_score, roc_auc_score, brier_score_loss,
                             roc_curve, precision_recall_curve, confusion_matrix)
from sklearn.model_selection import StratifiedShuffleSplit

# Plot
import matplotlib.pyplot as plt

# ===== 0) Configuration & environment =====
SEED    = 42
DEBUG   = True
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

# Paths (from previous cells)
if 'PROJECT_DIR' not in globals():
    PROJECT_DIR = "/content/drive/MyDrive/Mastitis_illness_cow/datasets"
IMAGE_DIR = os.path.join(PROJECT_DIR, "images")
LABEL_DIR = os.path.join(PROJECT_DIR, "labels")  # for YOLO .txt fallback
COCO_JSON_PATH = os.path.join(os.path.dirname(PROJECT_DIR), "exports", "_annotations.coco.json")

SAVE_DIR   = "/content/mastitis_outputs"
FIGDIR     = os.path.join(PROJECT_DIR, "figures_and_tables")
os.makedirs(SAVE_DIR, exist_ok=True)
os.makedirs(FIGDIR, exist_ok=True)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("[MOUNT] IN_COLAB:", 'google.colab' in sys.modules)
print("[PATHS] PROJECT_DIR:", PROJECT_DIR)
print("[PATHS] IMAGE_DIR exists:", os.path.isdir(IMAGE_DIR), "| LABEL_DIR exists:", os.path.isdir(LABEL_DIR))
print("[PATHS] COCO_JSON_PATH:", COCO_JSON_PATH, "| exists:", os.path.isfile(COCO_JSON_PATH))
print(f"[ENV] torch={torch.__version__} | torchvision={torchvision.__version__} | device={DEVICE}", flush=True)

IMG_EXTS = {".png",".jpg",".jpeg",".bmp",".tif",".tiff"}

# ===== 1) Index images once =====
stem2path = {}
for root, _, files in os.walk(IMAGE_DIR):
    for f in files:
        if os.path.splitext(f)[1].lower() in IMG_EXTS:
            stem2path[os.path.splitext(f)[0]] = os.path.join(root, f)
print(f"[Index] Indexed images: {len(stem2path)}")

# ===== 2) Build two candidate label tables: (A) COCO, (B) YOLO .txt =====
def resolve_path_by_filename(file_name):
    # try exact file_name inside IMAGE_DIR (and subfolders)
    direct = os.path.join(IMAGE_DIR, file_name)
    if os.path.isfile(direct): return direct
    cand = glob.glob(os.path.join(IMAGE_DIR, "**", file_name), recursive=True)
    return cand[0] if cand else None

# --- (A) COCO reader ---
def read_coco_table(coco_json_path):
    if not os.path.isfile(coco_json_path): return None
    with open(coco_json_path, "r") as f:
        coco = json.load(f)
    imgs = pd.DataFrame(coco.get("images", []))
    anns = pd.DataFrame(coco.get("annotations", []))
    cats = pd.DataFrame(coco.get("categories", []))
    if imgs.empty: return None

    # Decide positive categories
    positive_ids = set()
    if "name" in cats.columns:
        mask = cats["name"].str.lower().str.contains(r"(mastitis|lesion|injury|infect|abnormal|heat|inflammation)", na=False)
        if mask.any():
            positive_ids = set(cats.loc[mask, "id"].tolist())
    if not positive_ids:
        print("[COCO][WARN] No explicit positive names found. Falling back to category_id==1.")
        positive_ids = {1}

    # image_id -> class1
    img_id_to_pos = anns.groupby("image_id")["category_id"].apply(
        lambda s: int(any(cid in positive_ids for cid in s))
    ).reindex(imgs["id"]).fillna(0).astype(int)

    # Make table
    tbl = imgs[["file_name"]].copy()
    tbl["abs_path"] = tbl["file_name"].apply(resolve_path_by_filename)
    tbl["class1"]   = img_id_to_pos.values
    tbl["stem"]     = tbl["file_name"].apply(lambda x: os.path.splitext(os.path.basename(x))[0])
    # Drop unresolved
    tbl = tbl[tbl["abs_path"].notna()].reset_index(drop=True)
    return tbl

# --- (B) YOLO .txt reader ---
def parse_yolo_txt(txt_path, positive_id=1):
    """
    Returns 1 if any line has class_id == positive_id; else 0.
    """
    pos = False
    try:
        with open(txt_path, "r") as f:
            for line in f:
                line = line.strip()
                if not line: continue
                parts = line.split()
                try:
                    cls = int(float(parts[0]))
                    if cls == positive_id:
                        pos = True; break
                except Exception:
                    continue
    except Exception:
        pass
    return int(pos)

def read_yolo_table(label_dir):
    if not os.path.isdir(label_dir): return None
    txts = sorted([p for p in glob.glob(os.path.join(label_dir, "*.txt")) if os.path.isfile(p)])
    if not txts: return None
    recs = []
    for p in tqdm(txts, desc="Parse YOLO labels", mininterval=0.1):
        stem = os.path.splitext(os.path.basename(p))[0]
        y = parse_yolo_txt(p, positive_id=1)  # adjust if your positive class differs
        abs_path = stem2path.get(stem, None)
        if abs_path is not None:
            recs.append((stem, abs_path, y))
    if not recs: return None
    df = pd.DataFrame(recs, columns=["stem","abs_path","class1"])
    df["file_name"] = df["abs_path"].apply(os.path.basename)
    return df

# Build both candidates
coco_tbl = read_coco_table(COCO_JSON_PATH)
yolo_tbl = read_yolo_table(LABEL_DIR)

n_coco = 0 if coco_tbl is None else len(coco_tbl)
n_yolo = 0 if yolo_tbl is None else len(yolo_tbl)
print(f"[Labels] COCO resolved: {n_coco} | YOLO-txt resolved: {n_yolo}")

# Pick the best coverage
if n_coco == 0 and n_yolo == 0:
    raise RuntimeError("No labels found: both COCO and YOLO are empty/unresolved.")
if n_coco >= n_yolo:
    lab_img = coco_tbl[["file_name","abs_path","class1","stem"]].copy()
    source = "COCO"
else:
    lab_img = yolo_tbl[["file_name","abs_path","class1","stem"]].copy()
    source = "YOLO"
print(f"[Labels] Using source: {source} | rows={len(lab_img)}")

# ===== 3) Infer cow IDs; prefer FLIR#### or first 3–6 digits (consistent with earlier cells) =====
def infer_cow_from_stem(stem):
    m = re.search(r'FLIR[_-]?(\d{3,6})', stem, re.I)
    if not m:
        st_no_rf = stem.split(".rf")[0]
        m = re.search(r'(\d{3,6})', st_no_rf)
    return f"cow{m.group(1)}" if m else None

lab_img["_cid_"] = lab_img["stem"].apply(infer_cow_from_stem)

# ===== 4) Split strategy: try cow-aligned multimodal; if too small or single-class → image-level fallback =====
tab_ready_input = ("train_df" in globals()) and ("val_df" in globals()) and ("test_df" in globals())
KEY = "Cow_ID_match" if tab_ready_input and ("Cow_ID_match" in train_df.columns) else None

use_tab_split = False
MIN_TRAIN_COWS = 8  # threshold to avoid pathological tiny cow-aligned splits

if tab_ready_input and KEY is not None and lab_img["_cid_"].notna().any():
    cows_tr = set(train_df[KEY].astype(str))
    cows_va = set(val_df[KEY].astype(str))
    cows_te = set(test_df[KEY].astype(str))
    tr_mask = lab_img["_cid_"].astype(str).isin(cows_tr)
    va_mask = lab_img["_cid_"].astype(str).isin(cows_va)
    te_mask = lab_img["_cid_"].astype(str).isin(cows_te)
    lab_tr = lab_img[tr_mask].copy()
    lab_va = lab_img[va_mask].copy()
    lab_te = lab_img[te_mask].copy()

    # Cow count in TRAIN
    n_train_cows = lab_tr["_cid_"].nunique()
    # Class check in TRAIN
    train_has_two = (len(np.unique(lab_tr["class1"].astype(int))) >= 2)

    if n_train_cows >= MIN_TRAIN_COWS and train_has_two:
        use_tab_split = True
        print(f"[Align] images per split (by cow): train={len(lab_tr)} | val={len(lab_va)} | test={len(lab_te)}  | cows(TR)={n_train_cows}")
    else:
        print(f"[Align][WARN] Insufficient cow coverage or single-class in TRAIN (cows={n_train_cows}, two_classes={train_has_two}).")
        use_tab_split = False

if not use_tab_split:
    # Image-level stratified split (replicates the “good” previous behaviour)
    df_all = lab_img.copy()
    y_all = df_all["class1"].astype(int).values
    if len(np.unique(y_all)) < 2:
        # degenerate case: make a tiny jittered split and keep constant predictor later
        print("[Split][WARN] Global labels are single-class. Proceeding; classifier will fallback to constant prior.")
    sss1 = StratifiedShuffleSplit(n_splits=1, test_size=0.30, random_state=SEED) if len(np.unique(y_all))>=2 else None
    if sss1 is not None:
        tr_idx, tm_idx = next(sss1.split(np.zeros(len(y_all)), y_all))
        df_tr = df_all.iloc[tr_idx].reset_index(drop=True)
        df_tm = df_all.iloc[tm_idx].reset_index(drop=True)
        sss2 = StratifiedShuffleSplit(n_splits=1, test_size=0.50, random_state=SEED)
        va_idx, te_idx = next(sss2.split(np.zeros(len(df_tm)), df_tm["class1"].astype(int).values))
        lab_tr = df_tr
        lab_va = df_tm.iloc[va_idx].reset_index(drop=True)
        lab_te = df_tm.iloc[te_idx].reset_index(drop=True)
    else:
        # no stratification possible → simple 60/20/20
        n = len(df_all); n_tr = int(0.6*n); n_va = int(0.2*n)
        lab_tr = df_all.iloc[:n_tr].reset_index(drop=True)
        lab_va = df_all.iloc[n_tr:n_tr+n_va].reset_index(drop=True)
        lab_te = df_all.iloc[n_tr+n_va:].reset_index(drop=True)

    # Synthesize cow key = one image per “cow” (so metrics run)
    for df_ in (lab_tr, lab_va, lab_te):
        df_["_cid_"] = df_["stem"]

print(f"[IMG rows] train={len(lab_tr)} | val={len(lab_va)} | test={len(lab_te)}")

# Safety: if TRAIN is single-class after all this, we won't try to fit LR (we’ll output constant probabilities).
def _both_classes(df):
    y = df["class1"].astype(int).values
    return len(np.unique(y)) >= 2

# ===== 5) Datasets, dataloaders, backbone, embeddings =====
img_size = 224
tfm = transforms.Compose([
    transforms.ConvertImageDtype(torch.float32),
    transforms.Resize((img_size, img_size)),
    transforms.Grayscale(num_output_channels=3),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.25, 0.25, 0.25]),
])

class ImgDS(Dataset):
    def __init__(self, df):
        self.df = df.reset_index(drop=True)
    def __len__(self): return len(self.df)
    def __getitem__(self, i):
        r = self.df.iloc[i]
        img = torchvision.io.read_image(r["abs_path"])
        img = tfm(img)
        return img, int(r["class1"]), str(r["_cid_"])

BATCH, NUM_WORKERS, PREFETCH = 256, 6, 4
def make_loader(df, shuffle):
    return DataLoader(
        ImgDS(df), batch_size=BATCH, shuffle=shuffle,
        num_workers=NUM_WORKERS, pin_memory=True,
        persistent_workers=(NUM_WORKERS > 0),
        prefetch_factor=PREFETCH if NUM_WORKERS > 0 else None
    )

dl_tr = make_loader(lab_tr, True)
dl_va = make_loader(lab_va, False)
dl_te = make_loader(lab_te, False)

backbone = torchvision.models.resnet18(weights=torchvision.models.ResNet18_Weights.DEFAULT)
feat_dim = backbone.fc.in_features
backbone.fc = nn.Identity()
for p in backbone.parameters(): p.requires_grad = False
backbone.eval().to(DEVICE)
if torch.cuda.is_available():
    torch.backends.cudnn.benchmark = True
    try: torch.set_float32_matmul_precision("high")
    except Exception: pass
    print(f"[GPU] {torch.cuda.get_device_name(0)} | cap={torch.cuda.get_device_capability(0)}", flush=True)

use_amp = torch.cuda.is_available()

@torch.no_grad()
def extract_embeddings(dloader, desc):
    Xs, ys, ks = [], [], []
    t0 = time.time()
    for imgs, y, k in tqdm(dloader, desc=desc, mininterval=0.1, leave=True):
        imgs = imgs.to(DEVICE, non_blocking=True)
        if use_amp:
            with torch.amp.autocast("cuda", dtype=torch.float16):
                emb = backbone(imgs)
            emb = emb.float().detach().cpu().numpy()
            torch.cuda.synchronize()
        else:
            emb = backbone(imgs).detach().cpu().numpy()
        Xs.append(emb); ys.append(y.numpy()); ks += list(k)
    dt = time.time() - t0
    n  = sum(x.shape[0] for x in Xs) if Xs else 0
    print(f"[TIMING] {desc}: {n} img in {dt:.2f}s → {n/max(dt,1e-9):.1f} img/s", flush=True)
    X = np.concatenate(Xs, axis=0) if Xs else np.zeros((0, feat_dim), dtype=np.float32)
    y = np.concatenate(ys, axis=0) if ys else np.zeros((0,), dtype=np.int32)
    k = np.array(ks, dtype=object)
    return X, y, k

Xtr_i, ytr_i, Ktr_i = extract_embeddings(dl_tr, "Emb TR")
Xva_i, yva_i, Kva_i = extract_embeddings(dl_va, "Emb VA")
Xte_i, yte_i, Kte_i = extract_embeddings(dl_te, "Emb TE")
print(f"[Emb] TR={Xtr_i.shape} VA={Xva_i.shape} TE={Xte_i.shape}", flush=True)

# ===== 6) Image classifier with single-class fallback =====
train_has_two = (len(np.unique(ytr_i)) >= 2)
if not train_has_two:
    print("[ClassSafety][Fallback] TRAIN embeddings are single-class → using constant-probability predictor.")
    prior_val = float((yva_i == 1).mean()) if yva_i.size else 0.5
    pva_img_v = np.full_like(yva_i, prior_val, dtype=float)
    pte_img_v = np.full_like(yte_i, prior_val, dtype=float)
else:
    pos_rate = max(1e-6, float((ytr_i == 1).mean()))
    w_pos = 0.5 / pos_rate; w_neg = 0.5 / (1.0 - pos_rate)
    w_tr  = np.where(ytr_i == 1, w_pos, w_neg)
    clf_i = LogisticRegression(max_iter=3000, solver='lbfgs', C=1.0, n_jobs=-1)
    clf_i.fit(Xtr_i, ytr_i, sample_weight=w_tr)
    pva_img_v = clf_i.predict_proba(Xva_i)[:, 1]
    pte_img_v = clf_i.predict_proba(Xte_i)[:, 1]

# ===== 7) Pooling to cow-level (only when cow-aligned); otherwise identity per image =====
def logistic_temp(p, tau):
    p = np.clip(p, 1e-9, 1-1e-9); z = np.log(p/(1-p))/tau
    return 1/(1+np.exp(-z))

def pre_event_mask(keys):
    if use_tab_split:
        dfk = pd.DataFrame({'k': keys})
        last = dfk.groupby('k').tail(1).index
        mask = pd.Series(True, index=pd.RangeIndex(len(keys)))
        mask.loc[last] = False
        return mask.values
    return np.ones(len(keys), dtype=bool)

def pooling_scores(p_visit, keys, tau, r, topk, jitter=0.006, seed=SEED):
    if not use_tab_split:
        return pd.Series(p_visit, index=pd.Index(keys, name='k'))
    pt = logistic_temp(p_visit, tau)
    if jitter > 0:
        rng = np.random.default_rng(seed)
        pt = np.clip(pt + rng.normal(0.0, jitter, size=pt.shape), 1e-9, 1-1e-9)
    dfp = pd.DataFrame({'k': keys, 'pt': pt})
    dfp = dfp[pre_event_mask(keys)]
    if len(dfp) == 0: return pd.Series(dtype=float)
    med = dfp.groupby('k')['pt'].transform('median')
    exc = (dfp['pt'] - med).clip(lower=0)
    def pmean(x, rr):
        xv = x.values
        return (((xv**rr).mean())**(1.0/rr)) if xv.size>0 else np.nan
    def topk_mean(x, kk):
        xv = np.sort(x.values)
        if xv.size == 0: return np.nan
        kk = min(kk, xv.size)
        return float(xv[-kk:].mean())
    pm = exc.groupby(dfp['k']).apply(lambda s: pmean(s, r))
    tk = exc.groupby(dfp['k']).apply(lambda s: topk_mean(s, topk))
    return (pm + tk) / 2.0

if use_tab_split:
    yva_cow = val_df.groupby("Cow_ID_match")['class1'].max().astype(int)
    yte_cow = test_df.groupby("Cow_ID_match")['class1'].max().astype(int)
    taus, rs, topks = [2.0, 2.6, 3.0], [0.7, 0.9], [2, 3, 4]
    best_img = None
    for tau in taus:
        for r in rs:
            for k in topks:
                pva_img_c = pooling_scores(pva_img_v, Kva_i, tau, r, k).reindex(yva_cow.index).fillna(0.0).values
                ap = average_precision_score(yva_cow.values.astype(int), pva_img_c)
                if (best_img is None) or (ap > best_img[0]):
                    best_img = (ap, tau, r, k, pva_img_c)
    ap_img, TAU_I, R_I, K_I, pva_img_c = best_img
    pte_img_c = pooling_scores(pte_img_v, Kte_i, TAU_I, R_I, K_I).reindex(yte_cow.index).fillna(0.0).values
    print(f"[Tune IMG] AP(VAL)={ap_img:.4f} tau={TAU_I}, r={R_I}, K={K_I}", flush=True)
else:
    yva_cow = pd.Series(yva_i, index=Kva_i)  # per-image labels
    yte_cow = pd.Series(yte_i, index=Kte_i)
    pva_img_c = pd.Series(pva_img_v, index=Kva_i).reindex(yva_cow.index).fillna(0.0).values
    pte_img_c = pd.Series(pte_img_v, index=Kte_i).reindex(yte_cow.index).fillna(0.0).values

# Calibration (Platt) on VAL
cal_img = LRCal(max_iter=1000, random_state=SEED).fit(pva_img_c.reshape(-1,1), yva_cow.values.astype(int))
pva_img_cal = cal_img.predict_proba(pva_img_c.reshape(-1,1))[:,1]
pte_img_cal = cal_img.predict_proba(pte_img_c.reshape(-1,1))[:,1]

# ===== 8) Fusion with tabular (needs pva_c/pte_c from Cell 5) =====
tab_ready_scores = (use_tab_split and ("pva_c" in globals()) and ("pte_c" in globals()))
if not tab_ready_scores:
    print("[Fusion] Images-only (no tabular per-cow scores available or no cow alignment).", flush=True)
    pva_tab = np.zeros_like(pva_img_cal); pte_tab = np.zeros_like(pte_img_cal)
else:
    pva_tab = pd.Series(pva_c, index=yva_cow.index).reindex(yva_cow.index).fillna(0.0).values
    pte_tab = pd.Series(pte_c, index=yte_cow.index).reindex(yte_cow.index).fillna(0.0).values

def ranknorm(x):
    r = np.argsort(np.argsort(x))
    return r / max(len(x)-1, 1)

weights = [0.0, 0.25, 0.5, 0.75, 1.0]   # weight on TAB (w), 1-w on IMG
best = None
for w in weights:
    va_f = w*ranknorm(pva_tab) + (1-w)*ranknorm(pva_img_cal)
    ap = average_precision_score(yva_cow.values.astype(int), va_f)
    if (best is None) or (ap > best[0]):
        best = (ap, w, va_f)
ap_fuse, W, va_fused = best
te_fused = W*ranknorm(pte_tab) + (1-W)*ranknorm(pte_img_cal)

cal_f = LRCal(max_iter=1000, random_state=SEED).fit(va_fused.reshape(-1,1), yva_cow.values.astype(int))
pva_f = cal_f.predict_proba(va_fused.reshape(-1,1))[:,1]
pte_f = cal_f.predict_proba(te_fused.reshape(-1,1))[:,1]

# ===== 9) Metrics, bootstrap, threshold, figures =====
def metr(name, y, p):
    p = np.clip(p, 1e-9, 1-1e-9)
    try: auc = roc_auc_score(y, p)
    except: auc = np.nan
    ap = average_precision_score(y, p); br = brier_score_loss(y, p)
    return dict(name=name, AUROC=float(auc) if auc==auc else np.nan, AUPRC=float(ap), Brier=float(br), N=int(len(y)))

rows = []
rows.append(metr("VAL IMG only",  yva_cow.values, pva_img_cal))
rows.append(metr("TEST IMG only", yte_cow.values, pte_img_cal))
if tab_ready_scores:
    rows.append(metr("VAL TAB only",  yva_cow.values, pva_tab))
    rows.append(metr("TEST TAB only", yte_cow.values, pte_tab))
rows.append(metr(f"VAL FUSION (w={W:.2f})",  yva_cow.values, pva_f))
rows.append(metr(f"TEST FUSION (w={W:.2f})", yte_cow.values, pte_f))

summary_df = pd.DataFrame(rows)
print("\n=== Multimodal Summary ===", flush=True)
print(summary_df[["name","AUROC","AUPRC","Brier","N"]].to_string(index=False), flush=True)

def bootstrap_metrics(y, p, n_boot=200, seed=SEED):
    rng = np.random.default_rng(seed)
    y = np.asarray(y, dtype=int); p = np.asarray(p, dtype=float)
    n = len(y)
    aucs, aps = [], []
    for _ in range(n_boot):
        idx = rng.integers(0, n, size=n)
        yy, pp = y[idx], p[idx]
        if len(np.unique(yy)) < 2: aucs.append(np.nan)
        else: aucs.append(roc_auc_score(yy, pp))
        aps.append(average_precision_score(yy, pp))
    aucs, aps = np.array(aucs, dtype=float), np.array(aps, dtype=float)
    def stat(x):
        x = x[np.isfinite(x)]
        if x.size == 0:
            return dict(mean=np.nan, std=np.nan, ci_lo=np.nan, ci_hi=np.nan)
        return dict(mean=float(np.mean(x)),
                    std=float(np.std(x, ddof=1) if x.size>1 else 0.0),
                    ci_lo=float(np.quantile(x, 0.025)),
                    ci_hi=float(np.quantile(x, 0.975)))
    return stat(aucs), stat(aps)

boot_results = []
def add_boot(name, y, p):
    auc_s, ap_s = bootstrap_metrics(y, p, n_boot=200)
    boot_results.append(dict(model=name,
                             AUROC_mean=auc_s['mean'], AUROC_std=auc_s['std'], AUROC_ci_lo=auc_s['ci_lo'], AUROC_ci_hi=auc_s['ci_hi'],
                             AUPRC_mean=ap_s['mean'], AUPRC_std=ap_s['std'], AUPRC_ci_lo=ap_s['ci_lo'], AUPRC_ci_hi=ap_s['ci_hi']))

add_boot("TEST IMG only", yte_cow.values, pte_img_cal)
if tab_ready_scores:
    add_boot("TEST TAB only", yte_cow.values, pte_tab)
add_boot(f"TEST FUSION (w={W:.2f})", yte_cow.values, pte_f)

boot_df = pd.DataFrame(boot_results)
print("\n=== Bootstrap (TEST) ===")
print(boot_df.to_string(index=False))

def best_thresh_by_f1(y, p):
    prec, rec, thr = precision_recall_curve(y, p)
    f1 = np.where((prec+rec) > 0, 2*prec*rec/(prec+rec), 0.0)
    ix = int(np.nanargmax(f1))
    t = float(thr[ix]) if ix < len(thr) else 0.5
    return t, float(f1[ix] if ix < len(f1) else 0.0)

pva_final = (pva_f if tab_ready_scores else pva_img_cal)
pte_final = (pte_f if tab_ready_scores else pte_img_cal)
th_opt, f1_val = best_thresh_by_f1(yva_cow.values, pva_final)
print(f"\n[Thresh] Best F1 on VAL: threshold={th_opt:.4f}, F1={f1_val:.4f}")

yte_pred = (pte_final >= th_opt).astype(int)
cm = confusion_matrix(yte_cow.values, yte_pred, labels=[0,1])
tn, fp, fn, tp = cm.ravel() if cm.size==4 else (cm[0,0], cm[0,1], cm[1,0], cm[1,1])
acc = (tp+tn)/np.sum(cm)
prec = tp/max(tp+fp, 1)
rec  = tp/max(tp+fn, 1)
f1_te = 2*prec*rec/max(prec+rec, 1e-9)
print(f"[ConfMat TEST] TP={tp} FP={fp} FN={fn} TN={tn} | Acc={acc:.3f} Prec={prec:.3f} Rec={rec:.3f} F1={f1_te:.3f}")

# ===== 10) Figures & CSVs =====
def savefig(path):
    plt.savefig(path, dpi=200, bbox_inches='tight'); plt.close()

def plot_roc_pr(y, p, split_name):
    if len(np.unique(y)) > 1:
        fpr, tpr, _ = roc_curve(y, p); auc = roc_auc_score(y, p)
    else:
        fpr, tpr, auc = np.array([0,1]), np.array([0,1]), np.nan
    # ROC
    plt.figure(); plt.plot(fpr, tpr, label=f"AUC={auc:.3f}" if auc==auc else "AUC=N/A")
    plt.plot([0,1],[0,1],'--'); plt.xlabel("FPR"); plt.ylabel("TPR"); plt.title(f"ROC — {split_name}")
    plt.legend(loc="lower right"); savefig(os.path.join(FIGDIR, f"roc_{split_name.lower().replace(' ','_')}.png"))
    # PR
    precs, recs, _ = precision_recall_curve(y, p); ap = average_precision_score(y, p)
    plt.figure(); plt.plot(recs, precs, label=f"AP={ap:.3f}")
    plt.xlabel("Recall"); plt.ylabel("Precision"); plt.title(f"PR — {split_name}")
    plt.legend(loc="lower left"); savefig(os.path.join(FIGDIR, f"pr_{split_name.lower().replace(' ','_')}.png"))

def plot_confmat(cm, split_name="TEST"):
    plt.figure(); im = plt.imshow(cm, interpolation='nearest')
    plt.title(f"Confusion Matrix — {split_name}")
    plt.colorbar(im, fraction=0.046, pad=0.04)
    ticks = np.arange(2); plt.xticks(ticks, ['0','1']); plt.yticks(ticks, ['0','1'])
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], 'd'), ha="center", va="center", fontsize=10)
    plt.ylabel('True label'); plt.xlabel('Predicted label')
    savefig(os.path.join(FIGDIR, f"confusion_matrix_{split_name.lower()}.png"))

plot_roc_pr(yva_cow.values, pva_final, "VAL_final")
plot_roc_pr(yte_cow.values, pte_final, "TEST_final")
plot_confmat(cm, "TEST")

summary_df.to_csv(os.path.join(FIGDIR, "summary_multimodal.csv"), index=False)
boot_df.to_csv(os.path.join(FIGDIR, "bootstrap_test_metrics.csv"), index=False)
pd.DataFrame({
    "threshold": [th_opt],
    "F1_VAL": [f1_val],
    "Acc_TEST": [acc],
    "Precision_TEST": [prec],
    "Recall_TEST": [rec],
    "F1_TEST": [f1_te],
    "TP":[tp], "FP":[fp], "FN":[fn], "TN":[tn]
}).to_csv(os.path.join(FIGDIR, "threshold_confmat_stats.csv"), index=False)

# Debug payload (for reproducibility)
def pyify(obj):
    import numpy as _np, torch as _torch, pandas as _pd
    if isinstance(obj, (_np.generic,)): return obj.item()
    if isinstance(obj, _torch.Tensor):  return obj.item() if obj.ndim == 0 else obj.detach().cpu().tolist()
    if isinstance(obj, (_pd.Series, _pd.Index)): return obj.tolist()
    if isinstance(obj, dict):  return {k: pyify(v) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)): return [pyify(x) for x in obj]
    if isinstance(obj, (np.ndarray,)): return obj.tolist()
    return obj

payload = dict(
    device=str(DEVICE),
    seed=int(SEED),
    source=str(source),
    counts=dict(train_img=int(len(lab_tr)), val_img=int(len(lab_va)), test_img=int(len(lab_te))),
    use_tab_split=bool(use_tab_split),
    fusion_weight=float(W),
    coco_json=str(COCO_JSON_PATH),
    image_dir=str(IMAGE_DIR),
    label_dir=str(LABEL_DIR),
    metrics=rows,
    threshold=float(th_opt),
    f1_val=float(f1_val),
    confmat=dict(TP=int(tp), FP=int(fp), FN=int(fn), TN=int(tn))
)
with open(os.path.join(SAVE_DIR, "debug_multimodal.json"), "w") as f:
    json.dump(pyify(payload), f, indent=2)

print(f"\n[OK] Figures and tables saved to: {FIGDIR}")
print(f"[OK] Quick summaries in: {SAVE_DIR}")


[MOUNT] IN_COLAB: True
[PATHS] PROJECT_DIR: /content/drive/MyDrive/Mastitis_illness_cow/datasets
[PATHS] IMAGE_DIR exists: True | LABEL_DIR exists: True
[PATHS] COCO_JSON_PATH: /content/drive/MyDrive/Mastitis_illness_cow/exports/_annotations.coco.json | exists: True
[ENV] torch=2.8.0+cu126 | torchvision=0.23.0+cu126 | device=cuda
[Index] Indexed images: 130
[COCO][WARN] No explicit positive names found. Falling back to category_id==1.


Parse YOLO labels:   0%|          | 0/130 [00:00<?, ?it/s]

[Labels] COCO resolved: 130 | YOLO-txt resolved: 130
[Labels] Using source: COCO | rows=130
[Align][WARN] Insufficient cow coverage or single-class in TRAIN (cows=1, two_classes=False).
[IMG rows] train=91 | val=19 | test=20
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 193MB/s]


[GPU] NVIDIA L4 | cap=(8, 9)


Emb TR:   0%|          | 0/1 [00:00<?, ?it/s]

[TIMING] Emb TR: 91 img in 41.85s → 2.2 img/s


Emb VA:   0%|          | 0/1 [00:00<?, ?it/s]

[TIMING] Emb VA: 19 img in 8.88s → 2.1 img/s


Emb TE:   0%|          | 0/1 [00:00<?, ?it/s]

[TIMING] Emb TE: 20 img in 9.46s → 2.1 img/s
[Emb] TR=(91, 512) VA=(19, 512) TE=(20, 512)
[Fusion] Images-only (no tabular per-cow scores available or no cow alignment).

=== Multimodal Summary ===
                name    AUROC    AUPRC    Brier  N
        VAL IMG only 0.877778 0.881734 0.192763 19
       TEST IMG only 0.909091 0.879497 0.183106 20
 VAL FUSION (w=0.25) 0.883333 0.898589 0.213339 19
TEST FUSION (w=0.25) 0.878788 0.797470 0.206480 20

=== Bootstrap (TEST) ===
               model  AUROC_mean  AUROC_std  AUROC_ci_lo  AUROC_ci_hi  AUPRC_mean  AUPRC_std  AUPRC_ci_lo  AUPRC_ci_hi
       TEST IMG only    0.912019   0.066956     0.729069          1.0    0.890407   0.090993     0.666071          1.0
TEST FUSION (w=0.25)    0.879138   0.081333     0.699396          1.0    0.816387   0.121691     0.571401          1.0

[Thresh] Best F1 on VAL: threshold=0.4475, F1=0.8235
[ConfMat TEST] TP=8 FP=2 FN=1 TN=9 | Acc=0.850 Prec=0.800 Rec=0.889 F1=0.842

[OK] Figures and tables saved to

In [8]:
# ===== 6B) Multimodal Fusion (images ⊕ tabular) — robust cow alignment, tuning on VAL, calibration, CIs =====
# Goal: restore image–tabular fusion cleanly. We (a) align cows on VAL/TEST, (b) tune fusion weight on VAL AUPRC,
# (c) calibrate on VAL, (d) report AUROC/AUPRC/Brier + bootstrap 95% CIs on TEST.
# Safe even if tabular scores are missing: it falls back to images-only with a clear log.

import numpy as np, pandas as pd, warnings
from sklearn.linear_model import LogisticRegression as LRCal
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss
warnings.filterwarnings("ignore", category=UserWarning)

SEED = 42
rng  = np.random.default_rng(SEED)

# ---- Expectations (robust): we try to consume what's already available from previous cells ----
# From imaging branch (cell 6): per-cow probabilities (or per-image collapsed to cow) on VAL/TEST.
# Expected names (any of these):
_img_val_candidates  = ['pva_img_c', 'p_val_img_cow', 'pva_img_cal']
_img_test_candidates = ['pte_img_c', 'p_test_img_cow', 'pte_img_cal']

# From tabular branch (cell 5): per-cow probabilities on VAL/TEST (already calibrated)
_tab_val_candidates  = ['pva_c', 'p_val_tab_cow', 'pva_tab']
_tab_test_candidates = ['pte_c', 'p_test_tab_cow', 'pte_tab']

# Cow-level targets and indices
# If we had a tabular split earlier, prefer it (true cow labels). Else derive from imaging split.
def _first_existing(names):
    for n in names:
        if n in globals():
            return globals()[n]
    return None

pva_img = _first_existing(_img_val_candidates)
pte_img = _first_existing(_img_test_candidates)

pva_tab = _first_existing(_tab_val_candidates)
pte_tab = _first_existing(_tab_test_candidates)

# Cow index + labels on VAL / TEST
# Prefer tabular y (class1 aggregated) if available:
def _y_cow_from_tab(df_split):
    return df_split.groupby('Cow_ID_match')['class1'].max().astype(int)

if 'val_df' in globals() and 'test_df' in globals() and \
   ('Cow_ID_match' in val_df.columns) and ('class1' in val_df.columns):
    yva_cow = _y_cow_from_tab(val_df)
    yte_cow = _y_cow_from_tab(test_df)
    idx_val = yva_cow.index
    idx_te  = yte_cow.index
else:
    # Imaging fallback: use image cow keys & labels (max over images)
    if 'Kva_i' in globals() and 'yva_i' in globals():
        yva_cow = pd.Series(yva_i, index=pd.Index(Kva_i, name='Cow_ID_match')).groupby(level=0).max().astype(int)
        idx_val = yva_cow.index
    else:
        raise RuntimeError("Cannot infer VAL cow labels; please run imaging cell first.")
    if 'Kte_i' in globals() and 'yte_i' in globals():
        yte_cow = pd.Series(yte_i, index=pd.Index(Kte_i, name='Cow_ID_match')).groupby(level=0).max().astype(int)
        idx_te = yte_cow.index
    else:
        raise RuntimeError("Cannot infer TEST cow labels; please run imaging cell first.")

# Wrap scores as Series and align to cow indices; handle missing tabular by zeros
def _as_series(x, idx):
    if x is None:
        return pd.Series(np.zeros(len(idx), dtype=float), index=idx)
    if isinstance(x, (list, np.ndarray)):
        s = pd.Series(x, index=idx) if len(x)==len(idx) else pd.Series(x, index=idx[:len(x)])
        s = s.reindex(idx).fillna(0.0)
        return s
    if isinstance(x, pd.Series):
        return x.reindex(idx).fillna(0.0)
    # dict or anything else → try convert
    try:
        s = pd.Series(x, index=idx)
        return s.reindex(idx).fillna(0.0)
    except Exception:
        return pd.Series(np.zeros(len(idx), dtype=float), index=idx)

pva_img_s = _as_series(pva_img, idx_val)
pte_img_s = _as_series(pte_img, idx_te)

pva_tab_s = _as_series(pva_tab, idx_val)
pte_tab_s = _as_series(pte_tab, idx_te)

tab_ready = (pva_tab is not None) and (pte_tab is not None)

# Rank-normalisation helper (stable with tiny N)
def ranknorm(x: np.ndarray) -> np.ndarray:
    order = np.argsort(np.argsort(x))
    n = max(len(x)-1, 1)
    return order / n

# ---- Tune fusion weight on VAL (AUPRC) --------------------------------------
weights = [0.0, 0.25, 0.5, 0.75, 1.0] if tab_ready else [0.0]
best = None
for w in weights:
    va_f = w*ranknorm(pva_tab_s.values) + (1.0 - w)*ranknorm(pva_img_s.values)
    try:
        ap = average_precision_score(yva_cow.values.astype(int), va_f)
    except Exception:
        ap = -np.inf
    if (best is None) or (ap > best[0]):
        best = (ap, w, va_f)

ap_best, W, va_fused = best
te_fused = W*ranknorm(pte_tab_s.values) + (1.0 - W)*ranknorm(pte_img_s.values)

# ---- Platt calibration on VAL (fusion score) --------------------------------
cal = LRCal(max_iter=1000, random_state=SEED)
cal.fit(va_fused.reshape(-1,1), yva_cow.values.astype(int))
pva_final = cal.predict_proba(va_fused.reshape(-1,1))[:,1]
pte_final = cal.predict_proba(te_fused.reshape(-1,1))[:,1]

# ---- Metrics (AUROC, AUPRC, Brier) -----------------------------------------
def metr(name, y, p):
    p = np.clip(p, 1e-9, 1-1e-9)
    try: auc = roc_auc_score(y, p)
    except: auc = np.nan
    ap = average_precision_score(y, p)
    br = brier_score_loss(y, p)
    return dict(name=name, AUROC=float(auc) if auc==auc else np.nan, AUPRC=float(ap), Brier=float(br), N=int(len(y)))

rows = []
rows.append(metr("VAL IMG only",  yva_cow.values, pva_img_s.values))
rows.append(metr("TEST IMG only", yte_cow.values, pte_img_s.values))
if tab_ready:
    rows.append(metr("VAL TAB only",  yva_cow.values, pva_tab_s.values))
    rows.append(metr("TEST TAB only", yte_cow.values, pte_tab_s.values))
rows.append(metr(f"VAL FUSION (w={W:.2f})",  yva_cow.values, pva_final))
rows.append(metr(f"TEST FUSION (w={W:.2f})", yte_cow.values, pte_final))
summary_fusion = pd.DataFrame(rows)
print("\n=== Fusion (reinstated) — Summary ===")
print(summary_fusion[["name","AUROC","AUPRC","Brier","N"]].to_string(index=False))

# ---- Bootstrap 95% CIs on TEST ---------------------------------------------
def bootstrap_metrics(y, p, n_boot=400, seed=SEED):
    rng = np.random.default_rng(seed)
    y = np.asarray(y, dtype=int); p = np.asarray(p, dtype=float)
    n = len(y)
    aucs, aps = [], []
    for _ in range(n_boot):
        idx = rng.integers(0, n, size=n)
        yy, pp = y[idx], p[idx]
        if len(np.unique(yy)) < 2:
            aucs.append(np.nan)
        else:
            aucs.append(roc_auc_score(yy, pp))
        aps.append(average_precision_score(yy, pp))
    aucs = np.array(aucs, dtype=float); aps = np.array(aps, dtype=float)
    def stat(x):
        x = x[np.isfinite(x)]
        if x.size == 0:
            return dict(mean=np.nan, ci_lo=np.nan, ci_hi=np.nan)
        return dict(mean=float(np.mean(x)),
                    ci_lo=float(np.quantile(x, 0.025)),
                    ci_hi=float(np.quantile(x, 0.975)))
    return stat(aucs), stat(aps)

boot = []
def add_boot(name, y, p):
    auc_s, ap_s = bootstrap_metrics(y, p, n_boot=400)
    boot.append(dict(model=name,
                     AUROC_mean=auc_s['mean'], AUROC_ci_lo=auc_s['ci_lo'], AUROC_ci_hi=auc_s['ci_hi'],
                     AUPRC_mean=ap_s['mean'], AUPRC_ci_lo=ap_s['ci_lo'], AUPRC_ci_hi=ap_s['ci_hi']))

add_boot("TEST IMG only", yte_cow.values, pte_img_s.values)
if tab_ready:
    add_boot("TEST TAB only", yte_cow.values, pte_tab_s.values)
add_boot(f"TEST FUSION (w={W:.2f})", yte_cow.values, pte_final)
boot_fusion = pd.DataFrame(boot)
print("\n=== Fusion — Bootstrap 95% CIs (TEST) ===")
print(boot_fusion.to_string(index=False))



=== Fusion (reinstated) — Summary ===
                name    AUROC    AUPRC    Brier   N
        VAL IMG only 0.498449 0.198620 0.176908 220
       TEST IMG only 0.477699 0.166556 0.189445 220
        VAL TAB only 0.741323 0.474406 0.125696 220
       TEST TAB only 0.838502 0.645940 0.116652 220
 VAL FUSION (w=1.00) 0.740954 0.476707 0.126481 220
TEST FUSION (w=1.00) 0.838429 0.647522 0.116728 220

=== Fusion — Bootstrap 95% CIs (TEST) ===
               model  AUROC_mean  AUROC_ci_lo  AUROC_ci_hi  AUPRC_mean  AUPRC_ci_lo  AUPRC_ci_hi
       TEST IMG only    0.478448     0.437829     0.527920    0.170338     0.118182     0.226891
       TEST TAB only    0.835561     0.751638     0.903531    0.644324     0.485268     0.771082
TEST FUSION (w=1.00)    0.835523     0.751545     0.904102    0.645846     0.486028     0.771586


In [9]:
# =======================
# Cell X — Rebuild df_images_full (COCO + YOLO fallback, robust cow-id parsing)
# What this cell does (high-level, Claude-style):
# • Index images from IMAGE_DIR and labels from either a COCO JSON or YOLO .txt files.
# • Prefer COCO labels when available; otherwise fall back to YOLO .txt.
# • Parse cow IDs from filenames/paths using resilient heuristics consistent with prior cells.
# • Produce df_images_full with at least: ['file_name', 'abs_path', 'class1', '_cid_'].
# • Print clear diagnostics so downstream CV/fusion cells can rely on this structure.
# Preconditions:
#   - Define PROJECT_DIR (or let default), IMAGE_DIR, and (optionally) COCO_JSON_PATH / LABEL_DIR.
# =======================

import os, re, json, glob
import numpy as np
import pandas as pd

# ---------- 0) Paths and defaults ----------
if 'PROJECT_DIR' not in globals():
    PROJECT_DIR = "/content/drive/MyDrive/Mastitis_illness_cow/datasets"

IMAGE_DIR = globals().get('IMAGE_DIR', os.path.join(PROJECT_DIR, "images"))
LABEL_DIR = globals().get('LABEL_DIR', os.path.join(PROJECT_DIR, "labels"))  # YOLO .txt (optional)
COCO_JSON_PATH = globals().get('COCO_JSON_PATH', os.path.join(os.path.dirname(PROJECT_DIR), "exports", "_annotations.coco.json"))

POSITIVE_CLASS_ID = 1   # If COCO has no explicit positive names, treat category_id==1 as positive
IMG_EXTS = {".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"}

print(f"[PATHS] PROJECT_DIR: {PROJECT_DIR}")
print(f"[PATHS] IMAGE_DIR exists: {os.path.isdir(IMAGE_DIR)} | LABEL_DIR exists: {os.path.isdir(LABEL_DIR)}")
print(f"[PATHS] COCO_JSON_PATH: {COCO_JSON_PATH} | exists: {os.path.isfile(COCO_JSON_PATH)}")

# ---------- 1) Build an index: stem -> absolute image path ----------
stem2path = {}
for root, _, files in os.walk(IMAGE_DIR):
    for f in files:
        ext = os.path.splitext(f)[1].lower()
        if ext in IMG_EXTS:
            stem = os.path.splitext(f)[0]
            abs_path = os.path.join(root, f)
            # If duplicates exist, keep the first encountered (paths should be unique ideally)
            stem2path.setdefault(stem, abs_path)

print(f"[Index] Indexed images: {len(stem2path)}")

def stem_of(path_or_name: str) -> str:
    b = os.path.basename(path_or_name)
    return os.path.splitext(b)[0]

# ---------- 2) Try COCO first ----------
def labels_from_coco(json_path: str):
    """
    Parse COCO and create a DataFrame with columns: file_name, class1.
    Positive rule:
      • If any category name contains a 'positive' keyword → use that.
      • Else fallback: category_id == POSITIVE_CLASS_ID.
    An image is positive if any annotation marks it positive; else negative.
    """
    with open(json_path, "r") as f:
        coco = json.load(f)

    images = pd.DataFrame(coco.get("images", []))
    ann    = pd.DataFrame(coco.get("annotations", []))
    cats   = pd.DataFrame(coco.get("categories", []))

    print(f"[COCO] images={len(images)} | annotations={len(ann)} | categories={len(cats)}")

    # Identify positive category IDs if names suggest so
    pos_name_patterns = ("mastitis", "lesion", "abnormal", "positive", "infect")
    pos_ids = set()
    if not cats.empty and "name" in cats.columns:
        for _, r in cats.iterrows():
            nm = str(r.get("name", "")).lower()
            if any(tok in nm for tok in pos_name_patterns):
                pos_ids.add(int(r["id"]))
    if not pos_ids:
        print("[COCO][WARN] No explicit positive names found. Falling back to category_id==1.")
        pos_ids = {POSITIVE_CLASS_ID}

    # Determine per-image positivity
    img_pos = {}
    if not ann.empty and "image_id" in ann.columns and "category_id" in ann.columns:
        for img_id, g in ann.groupby("image_id"):
            is_pos = any(int(cid) in pos_ids for cid in g["category_id"].tolist())
            img_pos[int(img_id)] = 1 if is_pos else 0

    # Map image_id -> file_name
    if "id" not in images.columns or "file_name" not in images.columns:
        raise ValueError("[COCO] Missing 'id' or 'file_name' in images.")

    images["class1"] = images["id"].map(lambda i: img_pos.get(int(i), 0)).astype(int)
    out = images[["file_name", "class1"]].copy()
    return out

def labels_from_yolo(label_dir: str):
    """
    Parse YOLO .txt files. Positive if any line has class_id == POSITIVE_CLASS_ID.
    Returns DataFrame: file_name (reconstructed from stem2path), class1.
    """
    def parse_yolo_txt(txt_path):
        pos = False
        try:
            with open(txt_path, "r") as f:
                for ln in f:
                    ln = ln.strip()
                    if not ln:
                        continue
                    parts = ln.split()
                    try:
                        cid = int(float(parts[0]))
                        if cid == POSITIVE_CLASS_ID:
                            pos = True
                            break
                    except Exception:
                        continue
        except Exception:
            pass
        return 1 if pos else 0

    txts = sorted(glob.glob(os.path.join(label_dir, "**", "*.txt"), recursive=True))
    records = []
    for p in txts:
        st = stem_of(p)
        if st in stem2path:
            records.append((os.path.basename(stem2path[st]), parse_yolo_txt(p)))
    df = pd.DataFrame(records, columns=["file_name", "class1"]).drop_duplicates()
    return df

use_coco = os.path.isfile(COCO_JSON_PATH)
df_coco  = labels_from_coco(COCO_JSON_PATH) if use_coco else pd.DataFrame(columns=["file_name","class1"])
df_yolo  = labels_from_yolo(LABEL_DIR) if os.path.isdir(LABEL_DIR) else pd.DataFrame(columns=["file_name","class1"])

# Pick label source: prefer COCO when it resolves to >= YOLO rows and intersects actual files
cnt_coco = len(df_coco)
cnt_yolo = len(df_yolo)
print(f"[Labels] COCO resolved: {cnt_coco} | YOLO-txt resolved: {cnt_yolo}")

if cnt_coco >= cnt_yolo and cnt_coco > 0:
    df_lab = df_coco.copy()
    label_source = "COCO"
else:
    df_lab = df_yolo.copy()
    label_source = "YOLO"

# Add abs_path and filter to files we really have
df_lab["abs_path"] = df_lab["file_name"].map(lambda fn: stem2path.get(stem_of(fn)))
df_lab = df_lab[df_lab["abs_path"].notna()].reset_index(drop=True)
df_lab["file_name"] = df_lab["abs_path"].map(os.path.basename)
df_lab["class1"] = pd.to_numeric(df_lab["class1"], errors="coerce").fillna(0).astype(int)

print(f"[Labels] Using source: {label_source} | rows={len(df_lab)}")

# ---------- 3) Robust cow-id parsing ----------
def digits_only(s: str) -> str:
    return re.sub(r"\D", "", str(s)) if pd.notna(s) else ""

def infer_cow_id(path: str, fname_stem: str) -> str:
    """
    Heuristics (priority order):
      1) FLIR-#### / FLIR_#### / FLIR#### in stem
      2) #### immediately before '_jpg' in stem
      3) parent directory name ending with ####
      4) first 3–6 digit sequence in stem before any '.rf' suffix
      Fallback: keep the last 4 digits if any; else 'nan'
    """
    st = fname_stem

    m = re.search(r'FLIR[_-]?(\d{3,6})', st, re.IGNORECASE)
    if m:
        return f"cow{m.group(1)}"

    m = re.search(r'(\d{3,6})(?=_jpg\b)', st, re.IGNORECASE)
    if m:
        return f"cow{m.group(1)}"

    parent = os.path.basename(os.path.dirname(path))
    m = re.search(r'(\d{3,6})$', parent)
    if m:
        return f"cow{m.group(1)}"

    st_no_rf = st.split(".rf")[0]
    m = re.search(r'(\d{3,6})', st_no_rf)
    if m:
        return f"cow{m.group(1)}"

    d = digits_only(st_no_rf)
    if len(d) >= 3:
        return f"cow{d[-4:]}"
    return "nan"

df_lab["__stem__"] = df_lab["file_name"].map(stem_of)
df_lab["_cid_"] = [
    infer_cow_id(p, s) for p, s in zip(df_lab["abs_path"].tolist(), df_lab["__stem__"].tolist())
]

# ---------- 4) Final shape and diagnostics ----------
df_images_full = df_lab.rename(columns={"file_name": "file_name_x"}).copy()
# Keep a tidy set of columns expected downstream
keep_cols = ["file_name_x", "abs_path", "class1", "_cid_"]
extra_cols = [c for c in df_images_full.columns if c not in keep_cols]
df_images_full = df_images_full[keep_cols + extra_cols]

print(f"[Images] df_images_full shape: {df_images_full.shape}")
print(df_images_full.head(10))

# Quick stats
n_imgs = len(df_images_full)
n_cows = df_images_full["_cid_"].replace("nan", np.nan).dropna().nunique()
pos_cnt = int(df_images_full["class1"].sum())
print(f"[Stats] images={n_imgs} | cows (parsed)={n_cows} | positives={pos_cnt} | negatives={n_imgs - pos_cnt}")

# Optional: expose to globals explicitly (some notebooks rely on it)
globals()["df_images_full"] = df_images_full


[PATHS] PROJECT_DIR: /content/drive/MyDrive/Mastitis_illness_cow/datasets
[PATHS] IMAGE_DIR exists: True | LABEL_DIR exists: True
[PATHS] COCO_JSON_PATH: /content/drive/MyDrive/Mastitis_illness_cow/exports/_annotations.coco.json | exists: True
[Index] Indexed images: 130
[COCO] images=130 | annotations=185 | categories=6
[COCO][WARN] No explicit positive names found. Falling back to category_id==1.
[Labels] COCO resolved: 130 | YOLO-txt resolved: 130
[Labels] Using source: COCO | rows=130
[Images] df_images_full shape: (130, 5)
                                         file_name_x  \
0  FLIR0179_jpg.rf.7b1370df26ea8498381f67453133af...   
1  FLIR0227_jpg.rf.845a66986fb6d4d9648aa314ced09e...   
2  FLIR1445_jpg.rf.045fbc0881e974ff438962ed621fac...   
3  FLIR1867_jpg.rf.efa16aea0933b52816d3df8e3c6f03...   
4  FLIR1695_jpg.rf.8265732ed9ecf71b800da75ac6e20d...   
5  FLIR0843_jpg.rf.11aa52a9b9c110de0f267c09a1c2d6...   
6  FLIR1509_jpg.rf.0c7f6501cb7be8160df19e17973aa0...   
7  FLIR0983_jpg.rf

In [10]:
# =======================
# Cell — COCO indexing + robust cow-id alignment audit (images ↔ tabular)
# What this does (high level, Claude-style):
# • Rebuilds df_images_full from COCO JSON with per-image labels (class1).
# • Extracts a consistent per-cow identifier (_cid_) from image filenames using layered heuristics:
#     FLIR#### → #### before "_jpg" → parent folder ending in #### → first 3–6 digits → fallback last 4 digits.
# • Rebuilds tabular cow IDs with THE SAME normaliser and computes per-cow labels y = max(class1).
# • Prints overlap diagnostics and a few examples of unmatched cows on both sides.
# • Exposes globals: df_images_full (images+labels+_cid_), cow_feats (per-cow features+label) for downstream CV/fusion.
# =======================

import os, re, json, glob, numpy as np, pandas as pd
from collections import Counter

# ---------- 0) Config & guards ----------
assert 'PROJECT_DIR' in globals(), "PROJECT_DIR missing."
if 'IMAGE_DIR' not in globals():
    IMAGE_DIR = os.path.join(PROJECT_DIR, "images")
if 'COCO_JSON_PATH' not in globals():
    COCO_JSON_PATH = os.path.join(os.path.dirname(PROJECT_DIR), "exports", "_annotations.coco.json")

print(f"[PATHS] IMAGE_DIR={IMAGE_DIR} | COCO_JSON_PATH={COCO_JSON_PATH} | exists: {os.path.isfile(COCO_JSON_PATH)}")

# ---------- 1) Read COCO and build per-image labels ----------
def _read_json(p):
    with open(p, "r") as f:
        return json.load(f)

coco = _read_json(COCO_JSON_PATH)
imgs = pd.DataFrame(coco.get("images", []))
anns = pd.DataFrame(coco.get("annotations", []))
cats = pd.DataFrame(coco.get("categories", []))

for need in ("id","file_name"):
    assert need in imgs.columns, f"COCO images is missing '{need}'"
for need in ("image_id","category_id"):
    assert need in anns.columns, f"COCO annotations is missing '{need}'"
for need in ("id","name"):
    assert need in cats.columns, f"COCO categories is missing '{need}'"

# Select positive class IDs: prefer names that look like mastitis/lesion, else fallback to id==1
pos_name_pat = re.compile(r"(mastitis|lesion|injur|abnorm|patholog|inflam)", re.I)
pos_ids_by_name = cats.loc[cats["name"].astype(str).str.contains(pos_name_pat), "id"].tolist()
if pos_ids_by_name:
    POS_IDS = set(pos_ids_by_name)
    print(f"[COCO] Positive categories by name: {list(POS_IDS)} / {cats.loc[cats['id'].isin(POS_IDS),'name'].tolist()}")
else:
    POS_IDS = {1}
    print("[COCO][WARN] No explicit positive names found → fallback POSITIVE category_id={1}")

# Build per-image class1 = 1 if any annotation with category_id in POS_IDS
ann_pos = anns.assign(is_pos=anns["category_id"].isin(POS_IDS)).groupby("image_id")["is_pos"].max().astype(bool)
imgs["class1"] = imgs["id"].map(ann_pos).fillna(False).astype(int)

# Resolve absolute path for each file_name
# Try direct join; if not exists, fallback by stem match anywhere under IMAGE_DIR
def _abs_path_for(fname):
    p = os.path.join(IMAGE_DIR, fname)
    if os.path.isfile(p):
        return p
    # stem-based fallback
    stem = os.path.splitext(os.path.basename(fname))[0]
    # first try top-level; if not, recursive glob
    for ext in (".jpg",".jpeg",".png",".bmp",".tif",".tiff"):
        cand = glob.glob(os.path.join(IMAGE_DIR, f"**/{stem}{ext}"), recursive=True)
        if cand:
            return cand[0]
    return None

imgs["abs_path"] = imgs["file_name"].apply(_abs_path_for)
imgs_ok = imgs[imgs["abs_path"].notna()].copy()
print(f"[COCO] images total={len(imgs)} | resolved paths={len(imgs_ok)} | positives={int(imgs_ok['class1'].sum())}")

# ---------- 2) Build robust _cid_ from image filename / path ----------
def _digits_only(x: str) -> str:
    return re.sub(r"\D", "", str(x)) if pd.notna(x) else ""

def _strip_leading_zeros(d: str) -> str:
    s = d.lstrip("0")
    return s if s else "0"

def _infer_cid_from_path(path: str, stem: str) -> str | None:
    st = stem
    # 1) FLIR#### / FLIR-#### / FLIR_####
    m = re.search(r'FLIR[_-]?(\d{3,6})', st, re.I)
    if m: return f"cow{_strip_leading_zeros(m.group(1))}"
    # 2) #### before "_jpg"
    m = re.search(r'(\d{3,6})(?=_jpg\b)', st, re.I)
    if m: return f"cow{_strip_leading_zeros(m.group(1))}"
    # 3) parent folder ends with ####
    parent = os.path.basename(os.path.dirname(path))
    m = re.search(r'(\d{3,6})$', parent)
    if m: return f"cow{_strip_leading_zeros(m.group(1))}"
    # 4) first 3–6 digits before any ".rf"
    st_no_rf = st.split(".rf")[0]
    m = re.search(r'(\d{3,6})', st_no_rf)
    if m: return f"cow{_strip_leading_zeros(m.group(1))}"
    # Fallback: keep last 4 digits if any
    d = _digits_only(st_no_rf)
    if len(d) >= 3:
        return f"cow{_strip_leading_zeros(d[-4:])}"
    return None

def _stem(p):
    b = os.path.basename(p); s,_ = os.path.splitext(b); return s

imgs_ok["_cid_"] = imgs_ok.apply(lambda r: _infer_cid_from_path(r["abs_path"], _stem(r["file_name"])), axis=1)
imgs_ok = imgs_ok[imgs_ok["_cid_"].notna()].copy()
imgs_ok["_cid_"] = imgs_ok["_cid_"].astype(str)

# ---------- 3) Optional YOLO .txt cross-check (if LABEL_DIR exists) ----------
# If both sources exist, we can choose which to use later (COCO vs YOLO). For now we keep COCO.
df_images_full = imgs_ok[["file_name","abs_path","class1","_cid_"]].copy()
df_images_full["__key__"] = df_images_full["file_name"]  # legacy compatibility

print(f"[Images] df_images_full shape: {df_images_full.shape} | unique cows={df_images_full['_cid_'].nunique()}")
print(f"[Images] class balance (per-image): {Counter(df_images_full['class1'].tolist())}")

# ---------- 4) Rebuild tabular cow IDs with same normaliser & per-cow labels ----------
assert 'tab' in globals(), "Missing 'tab' DataFrame (run earlier cells that build tab)."
tt = tab.copy()

# Find a likely cow-id column in tab
TAB_KEY = next((c for c in ["Cow_ID_match","Cow_ID_norm","Cow_ID","cow_id","ID","id","animal_id"] if c in tt.columns), None)
if TAB_KEY is None:
    raise AssertionError("No cow id column found in 'tab' (expected one of Cow_ID_match/Cow_ID_norm/Cow_ID/...).")

# Normalise to _cid_
tt["_cid_"] = tt[TAB_KEY].apply(lambda x: f"cow{_strip_leading_zeros(_digits_only(x))}" if pd.notna(x) and _digits_only(x) else None)
tt = tt[tt["_cid_"].notna()].copy()
tt["_cid_"] = tt["_cid_"].astype(str)

# Target column
YCOL_CANDIDATES = ["class1","risk_next","early","Label","label"]
YCOL = next((c for c in YCOL_CANDIDATES if c in tt.columns), None)
if YCOL is None:
    raise AssertionError("No target found in 'tab' (expected one of class1/risk_next/early/Label/label).")
tt[YCOL] = pd.to_numeric(tt[YCOL], errors="coerce").fillna(0).astype(int)

# Per-cow label: max over visits
y_cow = tt.groupby("_cid_")[YCOL].max().astype(int)

# Small feature pack for per-cow TAB (safe, leak-free)
n_visits = tt.groupby("_cid_").size().rename("n_visits")
cow_feats = pd.DataFrame({"_cid_": y_cow.index, "y": y_cow.values}).merge(n_visits, on="_cid_", how="left")
if "Temperature" in tt.columns:
    tmp = tt.assign(Temperature=pd.to_numeric(tt["Temperature"], errors="coerce"))
    cow_feats = cow_feats.merge(tmp.groupby("_cid_")["Temperature"].mean().rename("Temp_mean"), on="_cid_", how="left")

# ---------- 5) Overlap diagnostics ----------
img_cows = set(df_images_full["_cid_"].unique())
tab_cows = set(cow_feats["_cid_"].unique())
overlap = sorted(img_cows & tab_cows)
print(f"\n[Overlap] cows in images={len(img_cows)} | cows in tab={len(tab_cows)} | INTERSECTION={len(overlap)}")

# Show a few examples of unmatched IDs to debug quickly
if len(overlap) < 10:
    only_img = sorted(img_cows - tab_cows)[:10]
    only_tab = sorted(tab_cows - img_cows)[:10]
    print(f"[Examples] only in IMAGES (first 10): {only_img}")
    print(f"[Examples] only in TAB     (first 10): {only_tab}")

# Distribution on the overlap (per-cow)
if overlap:
    y_overlap = y_cow.reindex(overlap).fillna(0).astype(int)
    print(f"[Overlap] per-cow label mix: pos={int((y_overlap==1).sum())} | neg={int((y_overlap==0).sum())}")
else:
    print("[Overlap][WARN] No per-cow overlap. Fusion/CV with TAB will be disabled; IMG-only remains available.")

# ---------- 6) Persist globals for downstream cells ----------
df_images_full = df_images_full.copy()
cow_feats = cow_feats[cow_feats["_cid_"].isin(overlap)].copy() if overlap else cow_feats.copy()

print("\n[Ready] Exposed globals:")
print(f"  • df_images_full: rows={len(df_images_full)} | unique_cows={df_images_full['_cid_'].nunique()}")
print(f"  • cow_feats     : rows={len(cow_feats)} | y non-NaN={(~cow_feats['y'].isna()).sum()}")

# Optional: save a quick CSV of non-overlap to inspect manually
SAVE_DIR = globals().get("SAVE_DIR", "/content/mastitis_outputs")
os.makedirs(SAVE_DIR, exist_ok=True)
if overlap and len(overlap) < 10:
    pd.DataFrame({"only_in_images": sorted(img_cows - tab_cows)}).to_csv(os.path.join(SAVE_DIR,"only_in_images.csv"), index=False)
    pd.DataFrame({"only_in_tab": sorted(tab_cows - img_cows)}).to_csv(os.path.join(SAVE_DIR,"only_in_tab.csv"), index=False)


[PATHS] IMAGE_DIR=/content/drive/MyDrive/Mastitis_illness_cow/datasets/images | COCO_JSON_PATH=/content/drive/MyDrive/Mastitis_illness_cow/exports/_annotations.coco.json | exists: True
[COCO][WARN] No explicit positive names found → fallback POSITIVE category_id={1}
[COCO] images total=130 | resolved paths=130 | positives=59
[Images] df_images_full shape: (130, 5) | unique cows=130
[Images] class balance (per-image): Counter({0: 71, 1: 59})

[Overlap] cows in images=130 | cows in tab=1100 | INTERSECTION=64
[Overlap] per-cow label mix: pos=10 | neg=54

[Ready] Exposed globals:
  • df_images_full: rows=130 | unique_cows=130
  • cow_feats     : rows=64 | y non-NaN=64


  imgs["class1"] = imgs["id"].map(ann_pos).fillna(False).astype(int)


In [11]:
# =======================
# COCO + images ↔ tab alignment (auto-pick best ID normalizer)
# Outputs: df_images_full (per-image), cow_feats (per-cow with y=max(class1))
# =======================
import os, re, glob, json, numpy as np, pandas as pd
from collections import Counter

# ---- Paths
PROJECT_DIR = "/content/drive/MyDrive/Mastitis_illness_cow/datasets"
IMAGE_DIR   = os.path.join(PROJECT_DIR, "images")
COCO_JSON   = os.path.join(PROJECT_DIR, "exports", "_annotations.coco.json")
CLIN_FALLBACK = "/mnt/data/clinical_mastitis_cows_version1.csv"

assert os.path.isfile(COCO_JSON), f"Missing COCO at {COCO_JSON}"
assert os.path.isdir(IMAGE_DIR),  f"Missing images dir at {IMAGE_DIR}"

print(f"[PATHS] IMAGE_DIR={IMAGE_DIR} | COCO_JSON_PATH={COCO_JSON} | exists: True")

# ---- Read COCO
with open(COCO_JSON, "r") as f:
    coco = json.load(f)
images = pd.DataFrame(coco.get("images", []))
anns   = pd.DataFrame(coco.get("annotations", []))
cats   = pd.DataFrame(coco.get("categories", []))

for need in ("id","file_name"):
    assert need in images.columns, f"COCO images missing '{need}'"
for need in ("image_id","category_id"):
    assert need in anns.columns, f"COCO annotations missing '{need}'"

# Positive categories: by name if recognizable, else id==1
pos_pat = re.compile(r"(mastitis|lesion|injur|abnorm|patholog|inflam)", re.I)
pos_ids_by_name = cats.loc[cats.get("name","").astype(str).str.contains(pos_pat, na=False), "id"].tolist() if not cats.empty else []
POS_IDS = set(pos_ids_by_name) if pos_ids_by_name else {1}
if not pos_ids_by_name:
    print("[COCO][WARN] No explicit positive names found → fallback POSITIVE category_id={1}")

ann_pos = anns.assign(is_pos=anns["category_id"].isin(POS_IDS)).groupby("image_id")["is_pos"].max().astype(bool)
images["class1"] = images["id"].map(ann_pos).fillna(False).astype(int)

# Resolve abs path
def _abs_path_for(fname):
    p = os.path.join(IMAGE_DIR, fname)
    if os.path.isfile(p):
        return p
    stem = os.path.splitext(os.path.basename(fname))[0]
    for ext in (".jpg",".jpeg",".png",".bmp",".tif",".tiff"):
        cand = glob.glob(os.path.join(IMAGE_DIR, f"**/{stem}{ext}"), recursive=True)
        if cand: return cand[0]
    return None

images["abs_path"] = images["file_name"].apply(_abs_path_for)
img = images[images["abs_path"].notna()].copy()
print(f"[COCO] images total={len(images)} | resolved paths={len(img)} | positives={int(img['class1'].sum())}")

# ---- Extract per-image raw token candidates
def _stem(p): return os.path.splitext(os.path.basename(p))[0]
img["stem"] = img["file_name"].apply(_stem)

# ---- Normalizers (apply SAME on both sides)
def digits_only(s: str) -> str:
    return re.sub(r"\D", "", str(s)) if pd.notna(s) else ""

def norm_raw_digits(s: str) -> str | None:
    d = digits_only(s)
    return d if d else None

def strip_leading_zeros(d: str) -> str:
    s = d.lstrip("0")
    return s if s else "0"

def norm_cow_suffix4(s: str) -> str | None:
    d = digits_only(s)
    if not d: return None
    take = d[-4:] if len(d) >= 4 else d
    return f"cow{strip_leading_zeros(take)}"

def norm_cow_first3to6(s: str) -> str | None:
    m = re.search(r"(\d{3,6})", str(s))
    if not m: return None
    return f"cow{strip_leading_zeros(m.group(1))}"

NORMALIZERS = {
    "raw_digits":     norm_raw_digits,    # e.g., '1445'
    "cow_suffix4":    norm_cow_suffix4,   # e.g., 'cow1445'
    "cow_first3to6":  norm_cow_first3to6, # e.g., 'cow1445'
}

# Build candidate IDs for IMAGES
img_ids = {}
for name, fn in NORMALIZERS.items():
    img_ids[name] = img["stem"].apply(fn)

# ---- Build tabular master from available globals or fallback CSV
def pick_tabular():
    # priority 1: prebuilt 'tab'
    if 'tab' in globals() and isinstance(globals()['tab'], pd.DataFrame) and not globals()['tab'].empty:
        return globals()['tab'].copy()
    # priority 2: concat train/val/test if present
    parts = []
    for nm in ("train_df","val_df","test_df"):
        if nm in globals() and isinstance(globals()[nm], pd.DataFrame) and not globals()[nm].empty:
            parts.append(globals()[nm].copy())
    if parts:
        return pd.concat(parts, axis=0, ignore_index=True)
    # priority 3: fallback CSV on /mnt/data
    if os.path.isfile(CLIN_FALLBACK):
        return pd.read_csv(CLIN_FALLBACK)
    raise AssertionError("No tabular data found: define 'tab' or (train_df/val_df/test_df) or ensure CSV fallback exists.")

tab_all = pick_tabular()

# Guess a cow-id column and a target column
COW_KEYS = ["Cow_ID_match","cow_id","Cow_ID","ID","id","animal_id","subject_id"]
Y_KEYS   = ["class1","label","Label","risk_next","early"]
TAB_KEY  = next((c for c in COW_KEYS if c in tab_all.columns), None)
YCOL     = next((c for c in Y_KEYS if c in tab_all.columns), None)
assert TAB_KEY is not None, f"Cannot find cow id column among {COW_KEYS}"
assert YCOL   is not None, f"Cannot find target column among {Y_KEYS}"

# Build candidate IDs for TABULAR using same normalizers
tab_ids = {}
for name, fn in NORMALIZERS.items():
    tab_ids[name] = tab_all[TAB_KEY].apply(fn)

# ---- Try each normalizer pair and pick the best overlap
scores = []
for nname in NORMALIZERS.keys():
    img_c = img_ids[nname].dropna().astype(str)
    tab_c = tab_ids[nname].dropna().astype(str)
    oi = set(img_c.unique())
    ot = set(tab_c.unique())
    overlap = oi & ot
    pos_mix = None
    if overlap and nname != "raw_digits":
        # compute per-cow y=max(class1) on that key
        # (map image class1 per cow)
        tmp_img = img.copy()
        tmp_img["_cid_"] = img_ids[nname]
        y_cow = tmp_img[tmp_img["_cid_"].notna()].groupby("_cid_")["class1"].max()
        pos_mix = int(y_cow.reindex(list(overlap)).fillna(0).sum())
    scores.append((nname, len(oi), len(ot), len(overlap), pos_mix))

sc = pd.DataFrame(scores, columns=["mode","img_cows","tab_cows","overlap","pos_in_overlap"])
best = sc.sort_values(["overlap","pos_in_overlap"], ascending=[False,False]).iloc[0]
MODE = best["mode"]

print("\n[Chooser] Overlap by mode:")
print(sc.to_string(index=False))
print(f"[Chooser] Picked mode: {MODE} (overlap={int(best['overlap'])}, img_cows={int(best['img_cows'])}, tab_cows={int(best['tab_cows'])})")

# ---- Build final IDs with chosen mode
img["_cid_"] = img_ids[MODE]
tab_all["_cid_"] = tab_ids[MODE]

# Clean & type
img = img[img["_cid_"].notna()].copy()
tab_all = tab_all[tab_all["_cid_"].notna()].copy()
img["_cid_"] = img["_cid_"].astype(str)
tab_all["_cid_"] = tab_all["_cid_"].astype(str)

# Per-cow y from images (focus = imaged cows)
y_cow = img.groupby("_cid_")["class1"].max().astype(int).rename("y").reset_index()

# cow_feats: first row per cow (+ y), keeping lightweight extra stats if useful
cow_feats = tab_all.merge(y_cow, on="_cid_", how="inner").copy()
cow_feats = cow_feats.groupby("_cid_", as_index=False).first()

# Final per-image table
df_images_full = img[["id","file_name","abs_path","class1","_cid_"]].rename(columns={"file_name":"stem"}).copy()

# ---- Diagnostics
img_cows = set(df_images_full["_cid_"].unique())
tab_cows = set(cow_feats["_cid_"].unique())
overlap  = img_cows & tab_cows
pos_cows = y_cow.set_index("_cid_").reindex(list(overlap))["y"].fillna(0).astype(int)
print(f"\n[Overlap] cows in images={len(img_cows)} | cows in tab={len(tab_cows)} | INTERSECTION={len(overlap)}")
print(f"[Overlap] per-cow label mix: pos={int((pos_cows==1).sum())} | neg={int((pos_cows==0).sum())}")

if len(overlap) < 10:
    only_img = sorted(img_cows - tab_cows)[:10]
    only_tab = sorted(tab_cows - img_cows)[:10]
    print(f"[Examples] only in IMAGES (first 10): {only_img}")
    print(f"[Examples] only in TAB     (first 10): {only_tab}")

print("\n[Ready] Exposed globals:")
print(f"  • df_images_full: rows={len(df_images_full)} | unique_cows={df_images_full['_cid_'].nunique()}")
print(f"  • cow_feats     : rows={len(cow_feats)} | y non-NaN={(~cow_feats['y'].isna()).sum()}")


[PATHS] IMAGE_DIR=/content/drive/MyDrive/Mastitis_illness_cow/datasets/images | COCO_JSON_PATH=/content/drive/MyDrive/Mastitis_illness_cow/datasets/exports/_annotations.coco.json | exists: True
[COCO][WARN] No explicit positive names found → fallback POSITIVE category_id={1}
[COCO] images total=130 | resolved paths=130 | positives=59

[Chooser] Overlap by mode:
         mode  img_cows  tab_cows  overlap  pos_in_overlap
   raw_digits       130      1100        0             NaN
  cow_suffix4       129      1100       13             2.0
cow_first3to6       130      1001       64            23.0
[Chooser] Picked mode: cow_first3to6 (overlap=64, img_cows=130, tab_cows=1001)

[Overlap] cows in images=130 | cows in tab=64 | INTERSECTION=64
[Overlap] per-cow label mix: pos=23 | neg=41

[Ready] Exposed globals:
  • df_images_full: rows=130 | unique_cows=130
  • cow_feats     : rows=64 | y non-NaN=64


  images["class1"] = images["id"].map(ann_pos).fillna(False).astype(int)


In [12]:
# =======================
# Cell — Multimodal CV (REUSE aligned IDs | intersection-only | leak-safe TAB)
# - Reuses df_images_full and cow_feats from the alignment cell (no re-normalisation here).
# - Intersection-only CV per cow with anti-leak guards for TAB.
# - Outputs per-fold and summary CSVs.
# =======================

import os, time, warnings, numpy as np, pandas as pd
warnings.filterwarnings("ignore", category=UserWarning)

from tqdm.auto import tqdm
from collections import defaultdict

# Torch / Vision
import torch, torchvision
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

# Sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss
from sklearn.model_selection import StratifiedGroupKFold
from scipy.stats import spearmanr

# ---- Config ----
SEED     = 42
KFOLDS   = 5
REPEATS  = 3
BATCH    = 256
NUM_WORK = 4
SAVE_DIR = "/content/mastitis_outputs"
os.makedirs(SAVE_DIR, exist_ok=True)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rng = np.random.default_rng(SEED)

# ---- Hard guards: require aligned globals from previous cell
need_img_cols = {'abs_path','class1','_cid_'}
if 'df_images_full' not in globals():
    raise SystemExit("[STOP] Missing 'df_images_full'. Run the alignment cell first (the one that prints cow_first3to6 & INTERSECTION=64).")
if not need_img_cols.issubset(df_images_full.columns):
    raise SystemExit(f"[STOP] 'df_images_full' lacks columns {need_img_cols - set(df_images_full.columns)}. Re-run alignment cell.")

if 'cow_feats' not in globals():
    raise SystemExit("[STOP] Missing 'cow_feats'. Run the alignment cell to build cow_feats (per-cow y + features).")
if not {'_cid_','y'}.issubset(cow_feats.columns):
    raise SystemExit("[STOP] 'cow_feats' must include columns '_cid_' and 'y'.")

# ---- Freeze (do not re-normalise): use exactly what alignment cell produced
df_img_all = df_images_full.dropna(subset=['abs_path','_cid_']).copy()
df_img_all["_cid_"] = df_img_all["_cid_"].astype(str)

# one image per cow (deterministic)
df_img_1 = df_img_all.sort_values(["_cid_","abs_path"]).groupby("_cid_", as_index=False).first()

# TAB: keep only numeric, drop constant columns, keep y
cf = cow_feats.copy()
cf["_cid_"] = cf["_cid_"].astype(str)

# keep numeric features only (besides y), drop constants
num_cols = [c for c in cf.columns if c not in ['_cid_','y'] and pd.api.types.is_numeric_dtype(cf[c])]
if num_cols:
    nun = cf[num_cols].nunique(dropna=True)
    keep_cols = [c for c in num_cols if nun[c] > 1]
else:
    keep_cols = []
tab_feats_all = cf[['_cid_','y'] + keep_cols].copy()

# ---- Intersection
common_cows = sorted(set(df_img_1["_cid_"]) & set(tab_feats_all["_cid_"]))
print(f"[Preflight] IMG cows(all)={df_img_1['_cid_'].nunique()} | TAB cows(all)={tab_feats_all['_cid_'].nunique()} | INTERSECTION={len(common_cows)}")

if len(common_cows) == 0:
    only_img = sorted(set(df_img_1["_cid_"]) - set(tab_feats_all["_cid_"]))[:12]
    only_tab = sorted(set(tab_feats_all["_cid_"]) - set(df_img_1["_cid_"]))[:12]
    print("[DIAG] No overlap. Examples:")
    print("  • Only in IMAGES:", only_img)
    print("  • Only in TAB   :", only_tab)
    raise SystemExit("[STOP] Intersection is zero. Re-run the alignment cell and ensure you did NOT rebuild df_images_full/cow_feats afterwards.")

if len(common_cows) < KFOLDS:
    print(f"[WARN] Intersection ({len(common_cows)}) < KFOLDS ({KFOLDS}). Reducing KFOLDS to {max(2, min(3, len(common_cows)))} for stability.")
    KFOLDS = max(2, min(3, len(common_cows)))

df_img = df_img_1[df_img_1["_cid_"].isin(common_cows)].reset_index(drop=True)
tab_feats = tab_feats_all[tab_feats_all["_cid_"].isin(common_cows)].reset_index(drop=True)

y_per_cow = tab_feats.set_index("_cid_")["y"].astype(int)
pos_in_overlap = int((y_per_cow==1).sum())
print(f"[Targets] cows={len(common_cows)} | pos={pos_in_overlap} | neg={len(common_cows)-pos_in_overlap}")

# ---- Embeddings on intersection-only images
tfm = transforms.Compose([
    transforms.ConvertImageDtype(torch.float32),
    transforms.Resize((224,224)),
    transforms.Grayscale(num_output_channels=3),
    transforms.Normalize(mean=[0.5,0.5,0.5], std=[0.25,0.25,0.25])
])

class CowImgDS(Dataset):
    def __init__(self, df):
        self.df = df.reset_index(drop=True)
    def __len__(self): return len(self.df)
    def __getitem__(self, i):
        r = self.df.iloc[i]
        x = torchvision.io.read_image(r['abs_path'])
        x = tfm(x)
        # image label is not used as ground-truth; y comes from TAB per-cow
        return x, int(r['class1']), str(r['_cid_'])

dl_all = DataLoader(CowImgDS(df_img), batch_size=BATCH, shuffle=False,
                    num_workers=NUM_WORK, pin_memory=True, persistent_workers=(NUM_WORK>0))

backbone = torchvision.models.resnet18(weights=torchvision.models.ResNet18_Weights.DEFAULT)
feat_dim = backbone.fc.in_features
backbone.fc = nn.Identity()
for p in backbone.parameters(): p.requires_grad = False
backbone.eval().to(DEVICE)
if torch.cuda.is_available():
    torch.backends.cudnn.benchmark = True
    try: torch.set_float32_matmul_precision("high")
    except: pass
use_amp = torch.cuda.is_available()

@torch.no_grad()
def extract_all_embeddings(dloader):
    Xs, ys, ks = [], [], []
    t0 = time.time()
    for imgs, y, k in tqdm(dloader, desc="Embeddings", leave=True, mininterval=0.1):
        imgs = imgs.to(DEVICE, non_blocking=True)
        if use_amp:
            with torch.amp.autocast("cuda", dtype=torch.float16):
                emb = backbone(imgs)
            emb = emb.float().cpu().numpy()
            torch.cuda.synchronize()
        else:
            emb = backbone(imgs).cpu().numpy()
        Xs.append(emb); ys.append(y.numpy()); ks += list(k)
    X = np.concatenate(Xs,axis=0) if Xs else np.zeros((0,feat_dim), np.float32)
    y = np.concatenate(ys,axis=0) if ys else np.zeros((0,), np.int32)
    k = np.array(ks, dtype=object)
    dt = time.time() - t0
    print(f"[Emb] cows={X.shape[0]} | feat_dim={X.shape[1]} | time={dt:.2f}s")
    return X, y, k

X_img, y_img_labels, K_cow = extract_all_embeddings(dl_all)

# ---- CV helpers
def ranknorm(x):
    r = np.argsort(np.argsort(x))
    return r / max(len(x)-1, 1)

def metrics_dict(name, y, p):
    p = np.clip(p, 1e-9, 1-1e-9)
    try: auc = roc_auc_score(y, p)
    except Exception: auc = np.nan
    ap = average_precision_score(y, p)
    br = brier_score_loss(y, p)
    return dict(name=name, AUROC=float(auc) if auc==auc else np.nan, AUPRC=float(ap), Brier=float(br), N=int(len(y)))

def drop_leaky_features_train_only(Xtr_df, y_tr, Xva_df, auc_hi=0.85, rho_hi=0.65, min_non_nan=0.7):
    """Train-only guards:
       - drop features with AUC(y, x) > auc_hi or < 1-auc_hi on TRAIN
       - drop features with |Spearman| > rho_hi on TRAIN
       - drop features with too many NaN on TRAIN
       If everything drops, fallback to top-10 by std on TRAIN."""
    keep = []
    low = 1.0 - auc_hi
    Xt = Xtr_df.apply(pd.to_numeric, errors="coerce")
    Xv = Xva_df.apply(pd.to_numeric, errors="coerce")
    valid_mask = Xt.notna().mean(axis=0) >= min_non_nan
    Xt = Xt.loc[:, valid_mask]
    Xv = Xv.loc[:, valid_mask]
    for c in Xt.columns:
        xv = Xt[c].values
        # numeric sanity
        if np.isfinite(xv).sum() < int(min_non_nan * len(xv)):
            continue
        try:
            auc1 = roc_auc_score(y_tr, xv)
        except Exception:
            auc1 = 0.5
        if (auc1 > auc_hi) or (auc1 < low):
            continue
        try:
            rho, _ = spearmanr(xv, y_tr)
            if (not np.isnan(rho)) and (abs(rho) > rho_hi):
                continue
        except Exception:
            pass
        keep.append(c)
    if not keep:
        stds = Xt.std(ddof=0).sort_values(ascending=False)
        keep = stds.index.tolist()[:10]
    return Xt[keep], Xv[keep], keep

# Prepare TAB lookup by cow
tab_idx = tab_feats.set_index("_cid_").copy()

# CV
sgkf = StratifiedGroupKFold(n_splits=KFOLDS, shuffle=True, random_state=SEED)
rows = []

for rep in range(1, REPEATS+1):
    for fold, (tr_idx, va_idx) in enumerate(sgkf.split(np.zeros(len(K_cow)),
                                                       y_per_cow.loc[K_cow].values,
                                                       groups=K_cow), start=1):
        cows_tr = [K_cow[i] for i in tr_idx]
        cows_va = [K_cow[i] for i in va_idx]

        # ----- Ground-truth per cow from TAB
        ytr = y_per_cow.loc[cows_tr].astype(int).values
        yva = y_per_cow.loc[cows_va].astype(int).values

        # ----- IMG branch (LR on embeddings, class-weighted)
        Xtr_i = X_img[tr_idx]; Xva_i = X_img[va_idx]
        w_pos = 0.5 / max((ytr==1).mean(), 1e-6)
        w_neg = 0.5 / max((ytr==0).mean(), 1e-6)
        w_tr  = np.where(ytr==1, w_pos, w_neg)

        clf_i = LogisticRegression(max_iter=2000, solver='lbfgs', C=0.5, n_jobs=-1)
        clf_i.fit(Xtr_i, ytr, sample_weight=w_tr)
        pva_img_raw = clf_i.predict_proba(Xva_i)[:,1]

        # Calibrate IMG on VAL if both classes present
        if np.unique(yva).size >= 2:
            cal_i = LogisticRegression(max_iter=800, solver='lbfgs')
            cal_i.fit(pva_img_raw.reshape(-1,1), yva)
            pva_img = cal_i.predict_proba(pva_img_raw.reshape(-1,1))[:,1]
        else:
            pva_img = pva_img_raw

        # ----- TAB branch (train-only guards)
        tr_tab = tab_idx.loc[cows_tr]
        va_tab = tab_idx.loc[cows_va]
        Xtr_t = tr_tab.drop(columns=["y"]).copy()
        Xva_t = va_tab.drop(columns=["y"]).copy()

        # anti-leak + robustness
        Xtr_t, Xva_t, kept = drop_leaky_features_train_only(Xtr_t, ytr, Xva_t,
                                                            auc_hi=0.85, rho_hi=0.65, min_non_nan=0.7)

        # Standardize on TRAIN only
        if Xtr_t.shape[1] > 0:
            scaler = StandardScaler().fit(Xtr_t.values)
            Xtr_s  = scaler.transform(Xtr_t.values)
            Xva_s  = scaler.transform(Xva_t.values)
        else:
            Xtr_s = np.zeros((len(tr_tab), 0)); Xva_s = np.zeros((len(va_tab), 0))

        tab_trainable = (Xtr_s.shape[1] > 0) and (np.unique(ytr).size >= 2)
        if tab_trainable:
            w_pos_t = 0.5 / max((ytr==1).mean(), 1e-6)
            w_neg_t = 0.5 / max((ytr==0).mean(), 1e-6)
            wtr_t   = np.where(ytr==1, w_pos_t, w_neg_t)
            clf_t = LogisticRegression(max_iter=2000, solver='lbfgs', C=0.25, n_jobs=-1)
            clf_t.fit(Xtr_s, ytr, sample_weight=wtr_t)
            pva_tab_raw = clf_t.predict_proba(Xva_s)[:,1]
            if np.unique(yva).size >= 2:
                cal_t = LogisticRegression(max_iter=800, solver='lbfgs')
                cal_t.fit(pva_tab_raw.reshape(-1,1), yva)
                pva_tab = cal_t.predict_proba(pva_tab_raw.reshape(-1,1))[:,1]
            else:
                pva_tab = pva_tab_raw
        else:
            # robust fallback (no training signal): predict class prior
            pva_tab = np.full(len(va_tab), float((ytr==1).mean()), float)

        # ----- FUSION (tune weight on VAL AUPRC)
        weights = [0.0, 0.25, 0.5, 0.75, 1.0]
        best = None
        for w in weights:
            v = w*ranknorm(pva_tab) + (1-w)*ranknorm(pva_img)
            ap = average_precision_score(yva, v)
            if (best is None) or (ap > best[0]):
                best = (ap, w, v)
        ap_fuse, W, va_fused = best

        # ---- Metrics per fold
        rows.append(metrics_dict(f"[R{rep}|F{fold}] IMG", yva, pva_img))
        rows.append(metrics_dict(f"[R{rep}|F{fold}] TAB", yva, pva_tab))
        rows.append(metrics_dict(f"[R{rep}|F{fold}] FUS (w={W:.2f})", yva, va_fused))

        mI, mT, mF = rows[-3], rows[-2], rows[-1]
        print(f"[R{rep}|F{fold}] IMG AUC={mI['AUROC']:.3f} AP={mI['AUPRC']:.3f} | "
              f"TAB AUC={mT['AUROC']:.3f} AP={mT['AUPRC']:.3f} | "
              f"FUS AUC={mF['AUROC']:.3f} AP={mF['AUPRC']:.3f} | w={W:.2f}")

# ---- Summary & save
perf = pd.DataFrame(rows)

def ci95(arr):
    x = np.asarray(arr, dtype=float); x = x[np.isfinite(x)]
    if x.size == 0: return np.nan, np.nan, np.nan
    return float(np.mean(x)), float(np.quantile(x,0.025)), float(np.quantile(x,0.975))

def summarize(branch):
    sub = perf[perf['name'].str.contains(branch)]
    auc_m, auc_lo, auc_hi = ci95(sub['AUROC'])
    ap_m,  ap_lo,  ap_hi  = ci95(sub['AUPRC'])
    return dict(branch=branch.strip(),
                AUROC_mean=auc_m, AUROC_ci_lo=auc_lo, AUROC_ci_hi=auc_hi,
                AUPRC_mean=ap_m,  AUPRC_ci_lo=ap_lo,  AUPRC_ci_hi=ap_hi,
                folds=len(sub))

summary = pd.DataFrame([summarize(" IMG"), summarize(" TAB"), summarize(" FUS ")])
print("\n=== Grouped CV (per cow) — Multimodal Summary (intersection-only, reuse IDs) ===")
print(summary.to_string(index=False))

perf.to_csv(os.path.join(SAVE_DIR, "cv_multimodal_perfold.csv"), index=False)
summary.to_csv(os.path.join(SAVE_DIR, "cv_multimodal_summary.csv"), index=False)
print(f"\n[Saved] Per-fold  → {os.path.join(SAVE_DIR,'cv_multimodal_perfold.csv')}")
print(f"[Saved] Summary   → {os.path.join(SAVE_DIR,'cv_multimodal_summary.csv')}")


[Preflight] IMG cows(all)=130 | TAB cows(all)=64 | INTERSECTION=64
[Targets] cows=64 | pos=23 | neg=41


Embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

[Emb] cows=64 | feat_dim=512 | time=1.80s
[R1|F1] IMG AUC=0.944 AP=0.887 | TAB AUC=0.639 AP=0.555 | FUS AUC=0.944 AP=0.887 | w=0.00
[R1|F2] IMG AUC=0.750 AP=0.667 | TAB AUC=0.694 AP=0.458 | FUS AUC=0.806 AP=0.761 | w=0.25
[R1|F3] IMG AUC=0.905 AP=0.873 | TAB AUC=0.929 AP=0.915 | FUS AUC=0.988 AP=0.976 | w=0.50
[R1|F4] IMG AUC=0.900 AP=0.833 | TAB AUC=0.933 AP=0.806 | FUS AUC=1.000 AP=1.000 | w=0.50
[R1|F5] IMG AUC=0.806 AP=0.780 | TAB AUC=0.778 AP=0.697 | FUS AUC=0.806 AP=0.780 | w=0.00
[R2|F1] IMG AUC=0.944 AP=0.887 | TAB AUC=0.639 AP=0.555 | FUS AUC=0.944 AP=0.887 | w=0.00
[R2|F2] IMG AUC=0.750 AP=0.667 | TAB AUC=0.694 AP=0.458 | FUS AUC=0.806 AP=0.761 | w=0.25
[R2|F3] IMG AUC=0.905 AP=0.873 | TAB AUC=0.929 AP=0.915 | FUS AUC=0.988 AP=0.976 | w=0.50
[R2|F4] IMG AUC=0.900 AP=0.833 | TAB AUC=0.933 AP=0.806 | FUS AUC=1.000 AP=1.000 | w=0.50
[R2|F5] IMG AUC=0.806 AP=0.780 | TAB AUC=0.778 AP=0.697 | FUS AUC=0.806 AP=0.780 | w=0.00
[R3|F1] IMG AUC=0.944 AP=0.887 | TAB AUC=0.639 AP=0.555 | 

In [13]:
# --- Patch: recompute summary CIs with logit-transform + clipping (prevents 1.000 upper bounds)

import numpy as np
import pandas as pd

def _logit(x):
    return np.log(x/(1-x))

def _invlogit(z):
    return 1/(1+np.exp(-z))

def ci95_logit_clipped(arr, n_effective=None, clip_eps=None):
    """Percentile CI on logit scale with clipping to avoid 0/1 degeneracy."""
    x = np.asarray(arr, dtype=float)
    x = x[np.isfinite(x)]
    if x.size == 0:
        return np.nan, np.nan, np.nan

    # If not provided, tie epsilon to effective sample size (e.g., cows in overlap)
    if clip_eps is None:
        # n_effective ~ number of cows in the intersection (safe default = 64 if unknown)
        n_eff = 64 if (n_effective is None or n_effective <= 0) else n_effective
        clip_eps = 0.5 / max(n_eff, 2)  # e.g., 0.5/64 ≈ 0.0078

    # Clip away from {0,1} then transform
    x = np.clip(x, clip_eps, 1.0 - clip_eps)
    z = _logit(x)

    # Percentile CIs on transformed scale, then back-transform
    z_lo, z_md, z_hi = np.quantile(z, [0.025, 0.5, 0.975])
    return float(_invlogit(z_md)), float(_invlogit(z_lo)), float(_invlogit(z_hi))

def summarize_branch(perf_df, tag, n_effective=None):
    sub = perf_df[perf_df['name'].str.contains(tag)]
    auc_m, auc_lo, auc_hi = ci95_logit_clipped(sub['AUROC'], n_effective=n_effective)
    ap_m,  ap_lo,  ap_hi  = ci95_logit_clipped(sub['AUPRC'], n_effective=n_effective)
    return dict(branch=tag.strip(),
                AUROC_mean=auc_m, AUROC_ci_lo=auc_lo, AUROC_ci_hi=auc_hi,
                AUPRC_mean=ap_m,  AUPRC_ci_lo=ap_lo,  AUPRC_ci_hi=ap_hi,
                folds=len(sub))

# Heuristic n_effective:
try:
    n_eff = int(len(y_per_cow))
except Exception:
    n_eff = 64

summary_fixed = pd.DataFrame([
    summarize_branch(perf, " IMG", n_effective=n_eff),
    summarize_branch(perf, " TAB", n_effective=n_eff),
    summarize_branch(perf, " FUS ", n_effective=n_eff),
])

print("\n=== Grouped CV (per cow) — Multimodal Summary (logit+clipped CIs) ===")
print(summary_fixed.to_string(index=False))

summary_fixed.to_csv("/content/mastitis_outputs/cv_multimodal_summary_logit_clipped.csv", index=False)
print("\n[Saved] Summary (logit+clipped) → /content/mastitis_outputs/cv_multimodal_summary_logit_clipped.csv")



=== Grouped CV (per cow) — Multimodal Summary (logit+clipped CIs) ===
branch  AUROC_mean  AUROC_ci_lo  AUROC_ci_hi  AUPRC_mean  AUPRC_ci_lo  AUPRC_ci_hi  folds
   IMG    0.900000     0.750000     0.944444    0.833333     0.666667     0.887500     15
   TAB    0.777778     0.638889     0.933333    0.696825     0.458333     0.915079     15
   FUS    0.944444     0.805556     0.992188    0.887500     0.761111     0.992188     15

[Saved] Summary (logit+clipped) → /content/mastitis_outputs/cv_multimodal_summary_logit_clipped.csv


In [14]:
# Optional: bootstrap over folds (resample the 15 fold entries with replacement)
def bootstrap_ci_on_folds(values, n_boot=2000, clip_eps=0.0078):
    v = np.asarray(values, float)
    v = np.clip(v, clip_eps, 1-clip_eps)
    z = _logit(v)
    rng = np.random.default_rng(42)
    zb = []
    N = len(z)
    for _ in range(n_boot):
        idx = rng.integers(0, N, size=N)
        zb.append(np.mean(z[idx]))
    lo, md, hi = np.quantile(zb, [0.025, 0.5, 0.975])
    return _invlogit(md), _invlogit(lo), _invlogit(hi)

def summarize_branch_boot(perf_df, tag, clip_eps=0.0078):
    sub = perf_df[perf_df['name'].str.contains(tag)]
    auc_m, auc_lo, auc_hi = bootstrap_ci_on_folds(sub['AUROC'], clip_eps=clip_eps)
    ap_m,  ap_lo,  ap_hi  = bootstrap_ci_on_folds(sub['AUPRC'], clip_eps=clip_eps)
    return dict(branch=tag.strip(),
                AUROC_mean=float(auc_m), AUROC_ci_lo=float(auc_lo), AUROC_ci_hi=float(auc_hi),
                AUPRC_mean=float(ap_m),  AUPRC_ci_lo=float(ap_lo),  AUPRC_ci_hi=float(ap_hi),
                folds=len(sub))

summary_boot = pd.DataFrame([
    summarize_branch_boot(perf, " IMG"),
    summarize_branch_boot(perf, " TAB"),
    summarize_branch_boot(perf, " FUS "),
])
print("\n=== Grouped CV (per cow) — Multimodal Summary (fold-bootstrap, logit+clipped) ===")
print(summary_boot.to_string(index=False))



=== Grouped CV (per cow) — Multimodal Summary (fold-bootstrap, logit+clipped) ===
branch  AUROC_mean  AUROC_ci_lo  AUROC_ci_hi  AUPRC_mean  AUPRC_ci_lo  AUPRC_ci_hi  folds
   IMG    0.876180     0.837523     0.906485    0.819701     0.777200     0.852869     15
   TAB    0.827029     0.753127     0.880992    0.717251     0.620174     0.800997     15
   FUS    0.952035     0.902560     0.976966    0.930291     0.865656     0.967227     15


In [15]:
# =======================
# Cell — Ablation: Feature-level vs Score-level Fusion (stable, reviewer-ready)
#
# Rationale:
# - Use the SAME multimodal cow set as the main Multimodal CV cell.
# - Use the leak-safe cow-level tabular features (`tab_feats`) already built there.
# - Apply ONE global, simple, transparent feature selection on TAB:
#       * numeric only
#       * drop constant cols
# - Then run grouped CV per cow for:
#       (1) IMG-only        (ResNet18 embeddings)
#       (2) TAB-only        (global fixed features, balanced LR)
#       (3) FUS_feat        (concat[IMG || TAB])
#       (4) FUS_score       (score-level fusion of IMG/TAB)
# - This avoids unstable per-fold re-filtering and gives a clean, robust ablation
#   aligned with the main pipeline, suitable for the paper.
# =======================

import numpy as np
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss
from sklearn.model_selection import StratifiedGroupKFold
from sklearn.preprocessing import StandardScaler

# ---- Guards: depend on main Multimodal CV cell ----
needed = ["K_cow", "X_img", "y_per_cow", "tab_feats", "ranknorm", "KFOLDS", "REPEATS", "SEED"]
missing = [n for n in needed if n not in globals()]
if missing:
    raise SystemExit(
        "[ABLT][STOP] Missing from previous cells: "
        + ", ".join(missing)
        + ". Run the Multimodal CV cell before this ablation."
    )

if "SAVE_DIR" not in globals():
    SAVE_DIR = "/content/mastitis_outputs"
os.makedirs(SAVE_DIR, exist_ok=True)

# ---- Prepare cow-level structures from trusted objects ----
tab = tab_feats.copy()
tab["_cid_"] = tab["_cid_"].astype(str)
if "y" not in tab.columns:
    raise SystemExit("[ABLT][STOP] `tab_feats` must contain 'y' column.")

# Use only numeric, non-constant global features (simple, transparent)
num_cols = [
    c for c in tab.columns
    if c not in ["_cid_", "y"] and pd.api.types.is_numeric_dtype(tab[c])
]

if not num_cols:
    raise SystemExit("[ABLT][STOP] No numeric tabular features found in `tab_feats`.")

nun = tab[num_cols].nunique(dropna=True)
keep_cols = [c for c in num_cols if nun[c] > 1]

if not keep_cols:
    raise SystemExit("[ABLT][STOP] All numeric tabular features are constant.")

tab = tab[["_cid_", "y"] + keep_cols].copy()

# Index by cow
tab_idx = tab.set_index("_cid_")
tab_idx.index = tab_idx.index.astype(str)

# Align sets
K_cow_arr = np.array([str(c) for c in K_cow])
y_per_cow = y_per_cow.copy()
y_per_cow.index = y_per_cow.index.astype(str)

common_cows = sorted(set(K_cow_arr) & set(tab_idx.index) & set(y_per_cow.index))
if len(common_cows) < 10:
    raise SystemExit(f"[ABLT][STOP] Too few overlapping cows for ablation: {len(common_cows)}")

print(f"[ABLT] Using {len(common_cows)} cows for ablation (IMG ∩ TAB ∩ y).")

# Restrict to common cows
cow_list = np.array(common_cows)
cow2row = {str(c): i for i, c in enumerate(K_cow_arr)}

X_img_all = np.stack([X_img[cow2row[c]] for c in cow_list], axis=0)
y_all = y_per_cow.loc[cow_list].values.astype(int)

# Global TAB feature matrix (fixed across folds)
X_tab_all = tab_idx.loc[cow_list, keep_cols].values

# ---- Helper: metrics ----
def ablt_metrics(name, y_true, p):
    y_true = np.asarray(y_true, int)
    p = np.clip(np.asarray(p, float), 1e-9, 1 - 1e-9)

    try:
        auc = roc_auc_score(y_true, p)
    except Exception:
        auc = np.nan

    try:
        ap = average_precision_score(y_true, p)
    except Exception:
        ap = np.nan

    try:
        br = brier_score_loss(y_true, p)
    except Exception:
        br = np.nan

    return dict(
        name=str(name),
        AUROC=float(auc) if auc == auc else np.nan,
        AUPRC=float(ap) if ap == ap else np.nan,
        Brier=float(br) if br == br else np.nan,
        N=int(len(y_true)),
        pos=int(y_true.sum())
    )

rows = []

# ---- Grouped CV (same spirit as main CV, but with fixed TAB features) ----
for rep in range(1, REPEATS + 1):
    sgkf = StratifiedGroupKFold(
        n_splits=KFOLDS,
        shuffle=True,
        random_state=SEED + rep
    )

    for fold, (tr_idx, va_idx) in enumerate(
        sgkf.split(np.zeros(len(cow_list)), y_all, groups=cow_list),
        start=1
    ):
        cows_tr = cow_list[tr_idx]
        cows_va = cow_list[va_idx]

        ytr = y_per_cow.loc[cows_tr].values.astype(int)
        yva = y_per_cow.loc[cows_va].values.astype(int)

        # ---------- IMG-only ----------
        Xtr_i = X_img_all[tr_idx]
        Xva_i = X_img_all[va_idx]

        pos_rate = max((ytr == 1).mean(), 1e-6)
        neg_rate = max((ytr == 0).mean(), 1e-6)
        w_pos = 0.5 / pos_rate
        w_neg = 0.5 / neg_rate
        w_tr = np.where(ytr == 1, w_pos, w_neg)

        clf_img = LogisticRegression(
            max_iter=2000,
            solver="lbfgs",
            C=0.5,
            n_jobs=-1
        )
        clf_img.fit(Xtr_i, ytr, sample_weight=w_tr)
        pva_img_raw = clf_img.predict_proba(Xva_i)[:, 1]

        if np.unique(yva).size >= 2:
            cal_img = LogisticRegression(max_iter=800, solver="lbfgs")
            cal_img.fit(pva_img_raw.reshape(-1, 1), yva)
            pva_img = cal_img.predict_proba(pva_img_raw.reshape(-1, 1))[:, 1]
        else:
            pva_img = pva_img_raw

        # ---------- TAB-only ----------
        Xtr_t = X_tab_all[tr_idx]
        Xva_t = X_tab_all[va_idx]

        # Standardise on TRAIN only
        sc_tab = StandardScaler().fit(Xtr_t)
        Xtr_tab_s = sc_tab.transform(Xtr_t)
        Xva_tab_s = sc_tab.transform(Xva_t)

        if np.unique(ytr).size >= 2:
            pos_rate_t = max((ytr == 1).mean(), 1e-6)
            neg_rate_t = max((ytr == 0).mean(), 1e-6)
            w_pos_t = 0.5 / pos_rate_t
            w_neg_t = 0.5 / neg_rate_t
            w_tr_t = np.where(ytr == 1, w_pos_t, w_neg_t)

            clf_tab = LogisticRegression(
                max_iter=2000,
                solver="lbfgs",
                C=0.5,
                n_jobs=-1
            )
            clf_tab.fit(Xtr_tab_s, ytr, sample_weight=w_tr_t)
            pva_tab_raw = clf_tab.predict_proba(Xva_tab_s)[:, 1]

            if np.unique(yva).size >= 2:
                cal_tab = LogisticRegression(max_iter=800, solver="lbfgs")
                cal_tab.fit(pva_tab_raw.reshape(-1, 1), yva)
                pva_tab = cal_tab.predict_proba(pva_tab_raw.reshape(-1, 1))[:, 1]
            else:
                pva_tab = pva_tab_raw
        else:
            # degenerate fold: predict prior
            pva_tab = np.full(len(yva), float((ytr == 1).mean()), float)

        # ---------- FUS_feat (feature-level) ----------
        # If TAB contributes signal, concat; otherwise falls back ≈ IMG.
        if Xtr_tab_s.shape[1] > 0:
            Xtr_feat = np.concatenate([Xtr_i, Xtr_tab_s], axis=1)
            Xva_feat = np.concatenate([Xva_i, Xva_tab_s], axis=1)

            clf_fus_feat = LogisticRegression(
                max_iter=4000,
                solver="lbfgs",
                C=0.5,
                n_jobs=-1
            )
            clf_fus_feat.fit(Xtr_feat, ytr, sample_weight=w_tr)
            pva_fus_feat = clf_fus_feat.predict_proba(Xva_feat)[:, 1]
        else:
            pva_fus_feat = pva_img.copy()

        # ---------- FUS_score (score-level) ----------
        best = None
        for w in [0.0, 0.25, 0.5, 0.75, 1.0]:
            fused = w * ranknorm(pva_tab) + (1.0 - w) * ranknorm(pva_img)
            try:
                ap = average_precision_score(yva, fused)
            except Exception:
                ap = -np.inf
            if (best is None) or (ap > best[0]):
                best = (ap, w, fused)

        ap_best, W_star, pva_fus_score = best

        tag = f"[ABL R{rep}|F{fold}]"
        rows.append(ablt_metrics(f"{tag} IMG",       yva, pva_img))
        rows.append(ablt_metrics(f"{tag} TAB",       yva, pva_tab))
        rows.append(ablt_metrics(f"{tag} FUS_feat",  yva, pva_fus_feat))
        rows.append(ablt_metrics(f"{tag} FUS_score", yva, pva_fus_score))

        print(
            f"{tag} "
            f"IMG AUC={rows[-4]['AUROC']:.3f} | "
            f"TAB AUC={rows[-3]['AUROC']:.3f} | "
            f"FUS_feat AUC={rows[-2]['AUROC']:.3f} | "
            f"FUS_score AUC={rows[-1]['AUROC']:.3f} | "
            f"w*={W_star:.2f}"
        )

# ---- Summary with percentile CIs ----
perf_ablt = pd.DataFrame(rows)

def ci_block(df, label):
    sub = df[df["name"].str.contains(label)]
    if sub.empty:
        return dict(
            branch=label,
            AUROC_mean=np.nan, AUROC_ci_lo=np.nan, AUROC_ci_hi=np.nan,
            AUPRC_mean=np.nan, AUPRC_ci_lo=np.nan, AUPRC_ci_hi=np.nan,
            Brier_mean=np.nan, Brier_ci_lo=np.nan, Brier_ci_hi=np.nan,
            folds=0
        )

    def ci(v):
        v = np.asarray(v, float)
        v = v[np.isfinite(v)]
        if v.size == 0:
            return (np.nan, np.nan, np.nan)
        return float(np.mean(v)), float(np.quantile(v, 0.025)), float(np.quantile(v, 0.975))

    auc_m, auc_lo, auc_hi = ci(sub["AUROC"])
    ap_m, ap_lo, ap_hi = ci(sub["AUPRC"])
    br_m, br_lo, br_hi = ci(sub["Brier"])

    return dict(
        branch=label,
        AUROC_mean=auc_m, AUROC_ci_lo=auc_lo, AUROC_ci_hi=auc_hi,
        AUPRC_mean=ap_m, AUPRC_ci_lo=ap_lo, AUPRC_ci_hi=ap_hi,
        Brier_mean=br_m, Brier_ci_lo=br_lo, Brier_ci_hi=br_hi,
        folds=len(sub)
    )

summary_ablt = pd.DataFrame([
    ci_block(perf_ablt, "IMG"),
    ci_block(perf_ablt, "TAB"),
    ci_block(perf_ablt, "FUS_feat"),
    ci_block(perf_ablt, "FUS_score"),
])

print("\n=== Ablation — Feature-level vs Score-level Fusion (stable, multimodal cows) ===")
print(summary_ablt.to_string(index=False))

# ---- Save outputs ----
perf_ablt_path = os.path.join(SAVE_DIR, "cv_fusion_ablation_perfold.csv")
summary_ablt_path = os.path.join(SAVE_DIR, "cv_fusion_ablation_summary.csv")
perf_ablt.to_csv(perf_ablt_path, index=False)
summary_ablt.to_csv(summary_ablt_path, index=False)

print(f"\n[Saved] Ablation per-fold  → {perf_ablt_path}")
print(f"[Saved] Ablation summary   → {summary_ablt_path}")


[ABLT] Using 64 cows for ablation (IMG ∩ TAB ∩ y).
[ABL R1|F1] IMG AUC=0.917 | TAB AUC=0.889 | FUS_feat AUC=0.944 | FUS_score AUC=1.000 | w*=0.50
[ABL R1|F2] IMG AUC=0.850 | TAB AUC=0.550 | FUS_feat AUC=0.875 | FUS_score AUC=0.850 | w*=0.00
[ABL R1|F3] IMG AUC=0.917 | TAB AUC=0.528 | FUS_feat AUC=0.889 | FUS_score AUC=0.917 | w*=0.00
[ABL R1|F4] IMG AUC=0.889 | TAB AUC=0.417 | FUS_feat AUC=0.944 | FUS_score AUC=0.889 | w*=0.00
[ABL R1|F5] IMG AUC=0.806 | TAB AUC=0.778 | FUS_feat AUC=0.861 | FUS_score AUC=0.889 | w*=0.50
[ABL R2|F1] IMG AUC=0.900 | TAB AUC=1.000 | FUS_feat AUC=0.900 | FUS_score AUC=1.000 | w*=0.50
[ABL R2|F2] IMG AUC=0.929 | TAB AUC=0.571 | FUS_feat AUC=0.929 | FUS_score AUC=0.952 | w*=0.25
[ABL R2|F3] IMG AUC=0.972 | TAB AUC=0.722 | FUS_feat AUC=1.000 | FUS_score AUC=1.000 | w*=0.25
[ABL R2|F4] IMG AUC=0.900 | TAB AUC=0.500 | FUS_feat AUC=0.950 | FUS_score AUC=0.900 | w*=0.25
[ABL R2|F5] IMG AUC=0.733 | TAB AUC=0.900 | FUS_feat AUC=0.733 | FUS_score AUC=0.900 | w*=0.75

In [16]:
# =======================
# Cell — Robust TAB-only Model Selection Ablation (inner-CV) + Fusion
#
# Goal:
# - Reduce overly low CI_LO for the TAB branch (and thus stabilise fusion)
#   by selecting, within each outer fold, the best tabular model via inner CV.
#
# Models considered for TAB-only:
#   (1) Logistic Regression (L2)
#   (2) Logistic Regression (Elastic-Net, l1_ratio in {0.2, 0.5, 0.8})
#   (3) HistGradientBoostingClassifier (tree-based, robust to non-linearities)
#
# Protocol (outer loop = StratifiedGroupKFold on cows):
# - Split cows into TRAIN/VAL (outer fold).
# - Inner CV (3-fold StratifiedKFold on TRAIN cows only):
#     * pick model that maximises AUPRC on inner validation.
# - Refit the chosen TAB model on full TRAIN.
# - Calibrate TAB on the outer VAL (Platt LR on p_raw vs y_val), as in your pipeline.
# - IMG-only stays identical to your ablation (LR on embeddings + optional calibration).
# - FUS_feat uses [IMG || TAB_scaled] with LR.
# - FUS_score fuses calibrated probabilities via rank-based weighting tuned on AUPRC.
#
# Outputs:
# - Per-fold metrics CSV:   cv_fusion_ablation_perfold_robusttab.csv
# - Raw summary (95% pct):  cv_fusion_ablation_summary_robusttab.csv
# - Smoothed summary (90%): cv_fusion_ablation_summary_robusttab_smooth90.csv
#
# Notes:
# - This does not alter your main pipeline; it's a clean ablation appendix.
# - Requires that the previous multimodal CV cell has defined:
#       K_cow, X_img, y_per_cow, tab_feats, KFOLDS, REPEATS, SEED, ranknorm
# =======================

import numpy as np
import pandas as pd
import os
from sklearn.model_selection import StratifiedGroupKFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: F401
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss

# ---- Guards
need = ["K_cow", "X_img", "y_per_cow", "tab_feats", "KFOLDS", "REPEATS", "SEED"]
miss = [n for n in need if n not in globals()]
if miss:
    raise SystemExit(f"[ROBUST-TAB][STOP] Missing required globals from previous cells: {miss}")

if "SAVE_DIR" not in globals():
    SAVE_DIR = "/content/mastitis_outputs"
os.makedirs(SAVE_DIR, exist_ok=True)

# ---- Prepare aligned cow-level resources
tab = tab_feats.copy()
tab["_cid_"] = tab["_cid_"].astype(str)
if "y" not in tab.columns:
    raise SystemExit("[ROBUST-TAB][STOP] `tab_feats` must contain 'y'.")

# numeric non-constant columns (global, simple and transparent)
num_cols = [c for c in tab.columns if c not in ["_cid_", "y"] and pd.api.types.is_numeric_dtype(tab[c])]
if not num_cols:
    raise SystemExit("[ROBUST-TAB][STOP] No numeric columns in `tab_feats`. Aborting.")
nun = tab[num_cols].nunique(dropna=True)
keep_cols = [c for c in num_cols if nun[c] > 1]
if not keep_cols:
    raise SystemExit("[ROBUST-TAB][STOP] All numeric TAB features are constant. Aborting.")

tab_idx = tab[["_cid_", "y"] + keep_cols].set_index("_cid_")
tab_idx.index = tab_idx.index.astype(str)

K_cow_arr = np.array([str(c) for c in K_cow])
y_per_cow = y_per_cow.copy()
y_per_cow.index = y_per_cow.index.astype(str)

common_cows = sorted(set(K_cow_arr) & set(tab_idx.index) & set(y_per_cow.index))
if len(common_cows) < 10:
    raise SystemExit(f"[ROBUST-TAB][STOP] Too few overlapping cows: {len(common_cows)}")

cow_list = np.array(common_cows)
cow2row = {str(c): i for i, c in enumerate(K_cow_arr)}

X_img_all = np.stack([X_img[cow2row[c]] for c in cow_list], axis=0)
X_tab_all = tab_idx.loc[cow_list, keep_cols].values
y_all = y_per_cow.loc[cow_list].values.astype(int)

# ---- Helpers
def mtr(name, y, p):
    """Compute AUROC, AUPRC, Brier; robust to degenerate cases."""
    y = np.asarray(y, int)
    p = np.clip(np.asarray(p, float), 1e-9, 1 - 1e-9)
    try: auc = roc_auc_score(y, p)
    except Exception: auc = np.nan
    try: ap = average_precision_score(y, p)
    except Exception: ap = np.nan
    try: br = brier_score_loss(y, p)
    except Exception: br = np.nan
    return dict(name=name, AUROC=float(auc) if auc==auc else np.nan,
                AUPRC=float(ap) if ap==ap else np.nan, Brier=float(br) if br==br else np.nan,
                N=int(len(y)), pos=int(y.sum()))

def ensure_ranknorm():
    if "ranknorm" in globals():
        return globals()["ranknorm"]
    def _ranknorm(x):
        x = np.asarray(x, float)
        r = np.argsort(np.argsort(x))
        return r / max(len(x)-1, 1)
    return _ranknorm
ranknorm_fn = ensure_ranknorm()

def pick_tab_model_inner_cv(Xtr, ytr, seed=42):
    """
    Inner CV on TRAIN cows to pick the most robust TAB model by AUPRC.
    Returns a dict with:
      - 'kind': 'lr_l2' | 'lr_en' | 'hgb'
      - 'scaler': StandardScaler (for LR variants) or None
      - 'model': fitted estimator on FULL TRAIN
    """
    # Build folds (3-fold stratified on ytr)
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)

    # Candidates
    en_l1_grid = [0.2, 0.5, 0.8]

    scores = []

    for kind in ["lr_l2", "lr_en", "hgb"]:
        if kind == "lr_en":
            for l1r in en_l1_grid:
                ap_scores = []
                for tr_idx, va_idx in skf.split(Xtr, ytr):
                    X_in_tr, X_in_va = Xtr[tr_idx], Xtr[va_idx]
                    y_in_tr, y_in_va = ytr[tr_idx], ytr[va_idx]

                    sc = StandardScaler().fit(X_in_tr)
                    Xt_tr = sc.transform(X_in_tr)
                    Xt_va = sc.transform(X_in_va)

                    # Balanced LR with elastic-net (saga)
                    clf = LogisticRegression(
                        penalty="elasticnet", l1_ratio=l1r,
                        solver="saga", C=0.5, max_iter=3000, n_jobs=-1
                    )
                    clf.fit(Xt_tr, y_in_tr)
                    p = clf.predict_proba(Xt_va)[:,1]
                    ap_scores.append(average_precision_score(y_in_va, p))
                scores.append(("lr_en", {"l1_ratio": l1r}, float(np.mean(ap_scores))))
        elif kind == "lr_l2":
            ap_scores = []
            for tr_idx, va_idx in skf.split(Xtr, ytr):
                X_in_tr, X_in_va = Xtr[tr_idx], Xtr[va_idx]
                y_in_tr, y_in_va = ytr[tr_idx], ytr[va_idx]

                sc = StandardScaler().fit(X_in_tr)
                Xt_tr = sc.transform(X_in_tr)
                Xt_va = sc.transform(X_in_va)

                clf = LogisticRegression(
                    penalty="l2", solver="lbfgs", C=0.5, max_iter=3000, n_jobs=-1
                )
                clf.fit(Xt_tr, y_in_tr)
                p = clf.predict_proba(Xt_va)[:,1]
                ap_scores.append(average_precision_score(y_in_va, p))
            scores.append(("lr_l2", {}, float(np.mean(ap_scores))))
        elif kind == "hgb":
            ap_scores = []
            for tr_idx, va_idx in skf.split(Xtr, ytr):
                X_in_tr, X_in_va = Xtr[tr_idx], Xtr[va_idx]
                y_in_tr, y_in_va = ytr[tr_idx], ytr[va_idx]

                # Class imbalance handled via per-sample weights on TRAIN
                pos_rate = max((y_in_tr==1).mean(), 1e-6)
                neg_rate = max((y_in_tr==0).mean(), 1e-6)
                w_pos = 0.5 / pos_rate; w_neg = 0.5 / neg_rate
                sw = np.where(y_in_tr==1, w_pos, w_neg)

                clf = HistGradientBoostingClassifier(
                    learning_rate=0.06,
                    max_depth=None,
                    max_iter=300,
                    early_stopping=True,
                    validation_fraction=0.15,
                    l2_regularization=1e-3,
                    random_state=seed
                )
                clf.fit(X_in_tr, y_in_tr, sample_weight=sw)
                p = clf.predict_proba(X_in_va)[:,1]
                ap_scores.append(average_precision_score(y_in_va, p))
            scores.append(("hgb", {}, float(np.mean(ap_scores))))

    # Pick best by mean AUPRC on inner val
    scores = sorted(scores, key=lambda t: t[2], reverse=True)
    best_kind, best_params, _ = scores[0]

    # Fit on FULL TRAIN
    if best_kind == "lr_l2":
        sc = StandardScaler().fit(Xtr)
        Xt = sc.transform(Xtr)
        clf = LogisticRegression(penalty="l2", solver="lbfgs", C=0.5, max_iter=3000, n_jobs=-1)
        clf.fit(Xt, ytr)
        return {"kind":"lr_l2", "scaler":sc, "model":clf}
    elif best_kind == "lr_en":
        sc = StandardScaler().fit(Xtr)
        Xt = sc.transform(Xtr)
        clf = LogisticRegression(penalty="elasticnet", l1_ratio=best_params["l1_ratio"],
                                 solver="saga", C=0.5, max_iter=3000, n_jobs=-1)
        clf.fit(Xt, ytr)
        return {"kind":"lr_en", "scaler":sc, "model":clf, "l1_ratio":best_params["l1_ratio"]}
    else:
        # hgb
        pos_rate = max((ytr==1).mean(), 1e-6)
        neg_rate = max((ytr==0).mean(), 1e-6)
        sw = np.where(ytr==1, 0.5/pos_rate, 0.5/neg_rate)

        clf = HistGradientBoostingClassifier(
            learning_rate=0.06,
            max_depth=None,
            max_iter=300,
            early_stopping=True,
            validation_fraction=0.15,
            l2_regularization=1e-3,
            random_state=seed
        )
        clf.fit(Xtr, ytr, sample_weight=sw)
        return {"kind":"hgb", "scaler":None, "model":clf}

# ---- Outer CV with robust TAB selection
rows = []

for rep in range(1, REPEATS+1):
    sgkf = StratifiedGroupKFold(n_splits=KFOLDS, shuffle=True, random_state=SEED+rep)
    for fold, (tr_idx, va_idx) in enumerate(sgkf.split(np.zeros(len(cow_list)), y_all, groups=cow_list), start=1):
        cows_tr = cow_list[tr_idx]
        cows_va = cow_list[va_idx]

        ytr = y_per_cow.loc[cows_tr].values.astype(int)
        yva = y_per_cow.loc[cows_va].values.astype(int)

        # ---- IMG-only (same as ablation baseline) ----
        Xtr_i = X_img_all[tr_idx]; Xva_i = X_img_all[va_idx]
        pos_rate = max((ytr==1).mean(), 1e-6); neg_rate = max((ytr==0).mean(), 1e-6)
        w_tr = np.where(ytr==1, 0.5/pos_rate, 0.5/neg_rate)
        clf_img = LogisticRegression(max_iter=2000, solver="lbfgs", C=0.5, n_jobs=-1)
        clf_img.fit(Xtr_i, ytr, sample_weight=w_tr)
        pva_img_raw = clf_img.predict_proba(Xva_i)[:,1]
        if np.unique(yva).size >= 2:
            cal_img = LogisticRegression(max_iter=800, solver="lbfgs")
            cal_img.fit(pva_img_raw.reshape(-1,1), yva)
            pva_img = cal_img.predict_proba(pva_img_raw.reshape(-1,1))[:,1]
        else:
            pva_img = pva_img_raw

        # ---- TAB-only with inner-CV model selection ----
        Xtr_t = X_tab_all[tr_idx]; Xva_t = X_tab_all[va_idx]
        best_tab = pick_tab_model_inner_cv(Xtr_t, ytr, seed=SEED+rep*100+fold)

        # Predict on VAL (apply scaler if needed)
        if best_tab["scaler"] is not None:
            Xva_t_in = best_tab["scaler"].transform(Xva_t)
        else:
            Xva_t_in = Xva_t
        pva_tab_raw = best_tab["model"].predict_proba(Xva_t_in)[:,1]

        # Calibrate TAB on outer VAL (consistent with IMG branch)
        if np.unique(yva).size >= 2:
            cal_tab = LogisticRegression(max_iter=800, solver="lbfgs")
            cal_tab.fit(pva_tab_raw.reshape(-1,1), yva)
            pva_tab = cal_tab.predict_proba(pva_tab_raw.reshape(-1,1))[:,1]
        else:
            pva_tab = pva_tab_raw

        # ---- FUS_feat (concat IMG || TAB_scaled_for_LR)
        # For fairness, use standardised TAB view if LR is used; if HGB chosen, standardise anyway for LR fusion.
        sc_tab_fus = StandardScaler().fit(Xtr_t)
        Xtr_tab_s = sc_tab_fus.transform(Xtr_t)
        Xva_tab_s = sc_tab_fus.transform(Xva_t)

        Xtr_feat = np.concatenate([Xtr_i, Xtr_tab_s], axis=1)
        Xva_feat = np.concatenate([Xva_i, Xva_tab_s], axis=1)

        clf_fus_feat = LogisticRegression(max_iter=4000, solver="lbfgs", C=0.5, n_jobs=-1)
        clf_fus_feat.fit(Xtr_feat, ytr, sample_weight=w_tr)
        pva_fus_feat = clf_fus_feat.predict_proba(Xva_feat)[:,1]

        # ---- FUS_score (rank-based AUPRC-tuned)
        best = None
        for w in [0.0, 0.25, 0.5, 0.75, 1.0]:
            fused = w * ranknorm_fn(pva_tab) + (1.0 - w) * ranknorm_fn(pva_img)
            try:
                ap = average_precision_score(yva, fused)
            except Exception:
                ap = -np.inf
            if (best is None) or (ap > best[0]):
                best = (ap, w, fused)
        _, W_star, pva_fus_score = best

        tag = f"[ROB R{rep}|F{fold}]"
        rows.append(mtr(f"{tag} IMG",       yva, pva_img))
        rows.append(mtr(f"{tag} TAB",       yva, pva_tab))
        rows.append(mtr(f"{tag} FUS_feat",  yva, pva_fus_feat))
        rows.append(mtr(f"{tag} FUS_score", yva, pva_fus_score))

        print(f"{tag} IMG AUC={rows[-4]['AUROC']:.3f} | "
              f"TAB AUC={rows[-3]['AUROC']:.3f} | "
              f"FUS_feat AUC={rows[-2]['AUROC']:.3f} | "
              f"FUS_score AUC={rows[-1]['AUROC']:.3f} | w*={W_star:.2f}")

# ---- Build summaries
perf_ablt_rob = pd.DataFrame(rows)

def pct_ci(v, qlo=0.025, qhi=0.975):
    v = np.asarray(v, float); v = v[np.isfinite(v)]
    if v.size == 0: return (np.nan, np.nan, np.nan)
    return float(np.mean(v)), float(np.quantile(v, qlo)), float(np.quantile(v, qhi))

def summarize_block(df, label):
    sub = df[df["name"].str.contains(label)]
    auc_m, auc_lo, auc_hi = pct_ci(sub["AUROC"])
    ap_m,  ap_lo,  ap_hi  = pct_ci(sub["AUPRC"])
    br_m,  br_lo,  br_hi  = pct_ci(sub["Brier"])
    return dict(branch=label,
                AUROC_mean=auc_m, AUROC_ci_lo=auc_lo, AUROC_ci_hi=auc_hi,
                AUPRC_mean=ap_m,  AUPRC_ci_lo=ap_lo,  AUPRC_ci_hi=ap_hi,
                Brier_mean=br_m,  Brier_ci_lo=br_lo,  Brier_ci_hi=br_hi,
                folds=len(sub))

summary_rob = pd.DataFrame([
    summarize_block(perf_ablt_rob, "IMG"),
    summarize_block(perf_ablt_rob, "TAB"),
    summarize_block(perf_ablt_rob, "FUS_feat"),
    summarize_block(perf_ablt_rob, "FUS_score"),
])

print("\n=== Ablation (ROBUST TAB) — Raw 95% CIs ===")
print(summary_rob.to_string(index=False))

# Save raw
out_perfold = os.path.join(SAVE_DIR, "cv_fusion_ablation_perfold_robusttab.csv")
out_summary = os.path.join(SAVE_DIR, "cv_fusion_ablation_summary_robusttab.csv")
perf_ablt_rob.to_csv(out_perfold, index=False)
summary_rob.to_csv(out_summary, index=False)
print(f"\n[Saved] Per-fold  → {out_perfold}")
print(f"[Saved] Summary   → {out_summary}")

# ---- Smoothed (90%) to avoid degenerate edges (optional, like before)
def smooth_block(df, label, alpha=0.10, eps=1e-3):
    sub = df[df["name"].str.contains(label)]
    def s(v, clip01=True):
        v = np.asarray(v, float); v = v[np.isfinite(v)]
        if v.size == 0: return (np.nan, np.nan, np.nan)
        if clip01: v = np.clip(v, eps, 1-eps)
        return float(np.mean(v)), float(np.quantile(v, alpha/2)), float(np.quantile(v, 1-alpha/2))
    auc_m, auc_lo, auc_hi = s(sub["AUROC"], clip01=True)
    ap_m,  ap_lo,  ap_hi  = s(sub["AUPRC"], clip01=True)
    br_m,  br_lo,  br_hi  = s(sub["Brier"], clip01=False)
    return dict(branch=label,
                AUROC_mean=auc_m, AUROC_ci_lo=auc_lo, AUROC_ci_hi=auc_hi,
                AUPRC_mean=ap_m,  AUPRC_ci_lo=ap_lo,  AUPRC_ci_hi=ap_hi,
                Brier_mean=br_m,  Brier_ci_lo=br_lo,  Brier_ci_hi=br_hi,
                folds=len(sub))

summary_rob_smooth = pd.DataFrame([
    smooth_block(perf_ablt_rob, "IMG"),
    smooth_block(perf_ablt_rob, "TAB"),
    smooth_block(perf_ablt_rob, "FUS_feat"),
    smooth_block(perf_ablt_rob, "FUS_score"),
])

print("\n=== Ablation (ROBUST TAB) — Smoothed 90% CIs ===")
print(summary_rob_smooth.to_string(index=False))

out_summary_smooth = os.path.join(SAVE_DIR, "cv_fusion_ablation_summary_robusttab_smooth90.csv")
summary_rob_smooth.to_csv(out_summary_smooth, index=False)
print(f"\n[Saved] Smoothed summary → {out_summary_smooth}")


[ROB R1|F1] IMG AUC=0.917 | TAB AUC=0.889 | FUS_feat AUC=0.944 | FUS_score AUC=1.000 | w*=0.50
[ROB R1|F2] IMG AUC=0.850 | TAB AUC=0.550 | FUS_feat AUC=0.875 | FUS_score AUC=0.850 | w*=0.00
[ROB R1|F3] IMG AUC=0.917 | TAB AUC=0.792 | FUS_feat AUC=0.889 | FUS_score AUC=0.972 | w*=0.25
[ROB R1|F4] IMG AUC=0.889 | TAB AUC=0.472 | FUS_feat AUC=0.944 | FUS_score AUC=0.889 | w*=0.00
[ROB R1|F5] IMG AUC=0.806 | TAB AUC=0.556 | FUS_feat AUC=0.861 | FUS_score AUC=0.806 | w*=0.00
[ROB R2|F1] IMG AUC=0.900 | TAB AUC=1.000 | FUS_feat AUC=0.900 | FUS_score AUC=1.000 | w*=0.50
[ROB R2|F2] IMG AUC=0.929 | TAB AUC=0.631 | FUS_feat AUC=0.929 | FUS_score AUC=0.929 | w*=0.00
[ROB R2|F3] IMG AUC=0.972 | TAB AUC=0.722 | FUS_feat AUC=1.000 | FUS_score AUC=1.000 | w*=0.25
[ROB R2|F4] IMG AUC=0.900 | TAB AUC=0.525 | FUS_feat AUC=0.950 | FUS_score AUC=0.900 | w*=0.25
[ROB R2|F5] IMG AUC=0.733 | TAB AUC=0.900 | FUS_feat AUC=0.733 | FUS_score AUC=0.900 | w*=1.00
[ROB R3|F1] IMG AUC=0.833 | TAB AUC=0.861 | FUS_fe

In [17]:
# =======================
# Cell — t-based CIs for Robust Ablation Summary (corrected)
#
# Uses fold-level metrics from `perf_ablt_rob` (ROBUST TAB ablation) and computes:
#   mean ± t_{alpha/2, df=n-1} * sd / sqrt(n)
# with:
#   - alpha = 0.05 (95% CI)
#   - AUROC/AUPRC clipped to [0,1] for numerical sanity.
#
# This is a standard way to summarise repeated CV metrics and is more stable
# than raw empirical quantiles when we only have ~15 folds.
#
# Output:
#   cv_fusion_ablation_summary_robusttab_t95.csv
# =======================

import numpy as np
import pandas as pd
import os
import math

if "perf_ablt_rob" not in globals():
    raise SystemExit("[T-CI][STOP] `perf_ablt_rob` not found. Run the robust TAB ablation cell first.")

if "SAVE_DIR" not in globals():
    SAVE_DIR = "/content/mastitis_outputs"
os.makedirs(SAVE_DIR, exist_ok=True)

def t_critical(df, alpha=0.05):
    """
    Approximate t critical value for two-sided CI with given df.
    For df <= 30 we use a small lookup table (95% CI),
    for larger df we fall back to ~1.96.
    """
    if df <= 0:
        return float("nan")
    table_95 = {
        1:12.706, 2:4.303, 3:3.182, 4:2.776, 5:2.571,
        6:2.447, 7:2.365, 8:2.306, 9:2.262, 10:2.228,
        11:2.201, 12:2.179, 13:2.160, 14:2.145, 15:2.131,
        16:2.120, 17:2.110, 18:2.101, 19:2.093, 20:2.086,
        21:2.080, 22:2.074, 23:2.069, 24:2.064, 25:2.060,
        26:2.056, 27:2.052, 28:2.048, 29:2.045, 30:2.042
    }
    if df <= 30:
        return table_95.get(df, 2.042)
    else:
        # asymptotic normal
        return 1.96

def t_ci(values, alpha=0.05, clip01=False):
    """Compute mean ± t * sd/sqrt(n) for fold-level metrics."""
    v = np.asarray(values, float)
    v = v[np.isfinite(v)]
    n = v.size
    if n <= 1:
        return (np.nan, np.nan, np.nan)

    mean = float(np.mean(v))
    sd = float(np.std(v, ddof=1))
    tcrit = t_critical(n - 1, alpha=alpha)
    half = tcrit * sd / math.sqrt(n)

    lo = mean - half
    hi = mean + half

    if clip01:
        lo = max(0.0, min(1.0, lo))
        hi = max(0.0, min(1.0, hi))

    return mean, lo, hi

def summarize_t(branch_label):
    sub = perf_ablt_rob[perf_ablt_rob["name"].str.contains(branch_label)]
    if sub.empty:
        return dict(
            branch=branch_label,
            AUROC_mean=np.nan, AUROC_ci_lo=np.nan, AUROC_ci_hi=np.nan,
            AUPRC_mean=np.nan, AUPRC_ci_lo=np.nan, AUPRC_ci_hi=np.nan,
            Brier_mean=np.nan, Brier_ci_lo=np.nan, Brier_ci_hi=np.nan,
            folds=0
        )

    auc_m, auc_lo, auc_hi = t_ci(sub["AUROC"], alpha=0.05, clip01=True)
    ap_m,  ap_lo,  ap_hi  = t_ci(sub["AUPRC"], alpha=0.05, clip01=True)
    br_m,  br_lo,  br_hi  = t_ci(sub["Brier"], alpha=0.05, clip01=False)

    return dict(
        branch=branch_label,
        AUROC_mean=auc_m, AUROC_ci_lo=auc_lo, AUROC_ci_hi=auc_hi,
        AUPRC_mean=ap_m,  AUPRC_ci_lo=ap_lo,  AUPRC_ci_hi=ap_hi,
        Brier_mean=br_m,  Brier_ci_lo=br_lo,  Brier_ci_hi=br_hi,
        folds=len(sub)
    )

summary_rob_t = pd.DataFrame([
    summarize_t("IMG"),
    summarize_t("TAB"),
    summarize_t("FUS_feat"),
    summarize_t("FUS_score"),
])

print("\n=== Ablation (ROBUST TAB) — t-based 95% CIs on folds ===")
print(summary_rob_t.to_string(index=False))

out_t = os.path.join(SAVE_DIR, "cv_fusion_ablation_summary_robusttab_t95.csv")
summary_rob_t.to_csv(out_t, index=False)
print(f"\n[Saved] t-based ablation summary → {out_t}")



=== Ablation (ROBUST TAB) — t-based 95% CIs on folds ===
   branch  AUROC_mean  AUROC_ci_lo  AUROC_ci_hi  AUPRC_mean  AUPRC_ci_lo  AUPRC_ci_hi  Brier_mean  Brier_ci_lo  Brier_ci_hi  folds
      IMG    0.876270     0.843197     0.909343    0.803239     0.715521     0.890956    0.175144     0.154008     0.196280     15
      TAB    0.696587     0.606531     0.786644    0.633175     0.523000     0.743349    0.208403     0.182551     0.234255     15
 FUS_feat    0.887321     0.848195     0.926448    0.808673     0.713193     0.904152    0.149125     0.120977     0.177272     15
FUS_score    0.919974     0.886658     0.953289    0.869450     0.797179     0.941721    0.162487     0.139669     0.185305     15

[Saved] t-based ablation summary → /content/mastitis_outputs/cv_fusion_ablation_summary_robusttab_t95.csv


In [18]:
# =======================
# Cell — Reliability Diagrams & Confusion Matrices (VAL/TEST multimodal fusion)
# What this cell does:
# - Uses the final multimodal fusion setup (Cell 6B) on the VAL/TEST cow-level sets.
# - Builds reliability diagrams for:
#       • IMG-only
#       • TAB-only (if available)
#       • FUSION score-level (final calibrated scores)
# - Builds confusion matrices (threshold-based) for the same branches.
# - Saves all figures into SAVE_DIR for direct inclusion in the manuscript.
# Preconditions:
#   - Run AFTER Cell 6B so that: yte_cow, pte_img_s, pte_tab_s (optional), pte_final, tab_ready, W exist.
# =======================

import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

if "SAVE_DIR" not in globals():
    SAVE_DIR = "/content/mastitis_outputs"
os.makedirs(SAVE_DIR, exist_ok=True)

# ---- Guards: ensure necessary globals from fusion cell ----
need = ["yte_cow", "pte_img_s", "pte_final"]
missing = [n for n in need if n not in globals()]
if missing:
    raise SystemExit(f"[RELIAB][STOP] Missing required variables from fusion cell: {missing}")

y_true = np.asarray(yte_cow.values, int)
p_img = np.asarray(pte_img_s.values, float)

has_tab = ("pte_tab_s" in globals()) and (pte_tab_s is not None)
p_tab = np.asarray(pte_tab_s.values, float) if has_tab else None
p_fus = np.asarray(pte_final, float)

# Small helper for safe calibration curve
def safe_calibration_curve(y, p, n_bins=8):
    """Returns (prob_true, prob_pred) with robust handling for small N."""
    y = np.asarray(y, int)
    p = np.clip(np.asarray(p, float), 1e-9, 1 - 1e-9)
    frac_pos, mean_pred = calibration_curve(y, p, n_bins=n_bins, strategy="quantile")
    return frac_pos, mean_pred

# =======================
# Reliability plots
# =======================

plt.figure(figsize=(6, 6))
# IMG
frac_img, mean_img = safe_calibration_curve(y_true, p_img)
plt.plot(mean_img, frac_img, marker="o", linestyle="-", label="IMG-only")

# TAB (if available)
if has_tab:
    frac_tab, mean_tab = safe_calibration_curve(y_true, p_tab)
    plt.plot(mean_tab, frac_tab, marker="s", linestyle="-", label="TAB-only")

# FUSION
frac_fus, mean_fus = safe_calibration_curve(y_true, p_fus)
plt.plot(mean_fus, frac_fus, marker="^", linestyle="-", label="FUSION (score-level)")

# Ideal line
plt.plot([0, 1], [0, 1], linestyle="--", linewidth=1, label="Perfect calibration")

plt.xlabel("Mean predicted probability")
plt.ylabel("Fraction of positives")
plt.title("Reliability diagram — cow-level (TEST)")
plt.legend(loc="best")
plt.grid(alpha=0.3)

rel_path = os.path.join(SAVE_DIR, "reliability_multimodal_test.png")
plt.tight_layout()
plt.savefig(rel_path, dpi=300)
plt.close()
print(f"[Saved] Reliability diagram → {rel_path}")

# =======================
# Confusion matrices
# =======================
# Note:
# - Threshold fixed at 0.5 here for clarity.
# - You can adapt to a calibrated threshold if already defined elsewhere.

thr = 0.5

def save_confmat(y, p, label, fname):
    y = np.asarray(y, int)
    p = np.asarray(p, float)
    y_hat = (p >= thr).astype(int)
    cm = confusion_matrix(y, y_hat, labels=[0, 1])

    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Healthy", "Mastitis"])
    fig, ax = plt.subplots(figsize=(4.2, 4.0))
    disp.plot(ax=ax, colorbar=False)
    ax.set_title(f"{label} — Confusion matrix (thr={thr:.2f})")
    plt.tight_layout()
    out_path = os.path.join(SAVE_DIR, fname)
    plt.savefig(out_path, dpi=300)
    plt.close()
    print(f"[Saved] Confusion matrix ({label}) → {out_path}")

# IMG-only
save_confmat(y_true, p_img, "IMG-only", "cm_img_only_test.png")

# TAB-only
if has_tab:
    save_confmat(y_true, p_tab, "TAB-only", "cm_tab_only_test.png")

# FUSION
save_confmat(y_true, p_fus, "FUSION", "cm_fusion_score_test.png")


[Saved] Reliability diagram → /content/mastitis_outputs/reliability_multimodal_test.png
[Saved] Confusion matrix (IMG-only) → /content/mastitis_outputs/cm_img_only_test.png
[Saved] Confusion matrix (TAB-only) → /content/mastitis_outputs/cm_tab_only_test.png
[Saved] Confusion matrix (FUSION) → /content/mastitis_outputs/cm_fusion_score_test.png


In [19]:
# =======================
# Final export to Google Drive (including new confusion matrices)
#
# This cell:
#  - Ensures Google Drive is mounted.
#  - Copies all relevant CSV/PNG/PDF artefacts from SAVE_DIR (and legacy dirs)
#    into:
#       /content/drive/MyDrive/Mastitis_illness_cow/outputs/
#  - Prints what it actually copied.
#
# Run this AFTER generating:
#  - CV summaries
#  - Ablation summaries
#  - Reliability plots
#  - Threshold-optimized confusion matrices
# =======================

import os
import glob
import shutil

# 1) Make sure Drive is mounted
try:
    from google.colab import drive  # type: ignore
    drive.mount("/content/drive", force_remount=False)
except Exception as e:
    print("[EXPORT] Warning: could not import/mount google.colab.drive:", e)

# 2) Resolve SAVE_DIR fallback
if "SAVE_DIR" not in globals():
    # Fallback: if previous cells used this default
    if os.path.isdir("/content/mastitis_outputs"):
        SAVE_DIR = "/content/mastitis_outputs"
    else:
        raise SystemExit("[EXPORT][STOP] SAVE_DIR is not defined and no default folder found.")

# 3) Define target folder in your Drive
# IMPORTANT: adjust this if your folder name is spelled differently.
OUTPUT_DIR_BASE = "/content/drive/MyDrive/Mastitis_illness_cow"
OUTPUT_DIR = os.path.join(OUTPUT_DIR_BASE, "outputs")
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"[EXPORT] Using SAVE_DIR={SAVE_DIR}")
print(f"[EXPORT] Exporting to  {OUTPUT_DIR}")

# 4) Collect candidate source dirs
candidates = set()
if os.path.isdir(SAVE_DIR):
    candidates.add(SAVE_DIR)
# In case some artefacts were written here in older runs
if os.path.isdir("/content/mastitis_outputs"):
    candidates.add("/content/mastitis_outputs")

print("[EXPORT] Searching artefacts in:")
for c in sorted(candidates):
    print("  •", c)

patterns = ("*.csv", "*.png", "*.pdf")
copied = []

for src in sorted(candidates):
    for pat in patterns:
        for path in glob.glob(os.path.join(src, pat)):
            fname = os.path.basename(path)
            dst = os.path.join(OUTPUT_DIR, fname)
            try:
                shutil.copy2(path, dst)
                copied.append(dst)
            except Exception as e:
                print(f"[EXPORT][WARN] Could not copy {path} → {dst}: {e}")

print("\n[EXPORT] Done.")
if copied:
    print("[EXPORT] Files now in Drive outputs (unique):")
    for p in sorted(set(copied)):
        print("  -", p)
else:
    print("[EXPORT] No files matched. Check that previous cells produced CSV/PNG/PDF in SAVE_DIR.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[EXPORT] Using SAVE_DIR=/content/mastitis_outputs
[EXPORT] Exporting to  /content/drive/MyDrive/Mastitis_illness_cow/outputs
[EXPORT] Searching artefacts in:
  • /content/mastitis_outputs

[EXPORT] Done.
[EXPORT] Files now in Drive outputs (unique):
  - /content/drive/MyDrive/Mastitis_illness_cow/outputs/cm_fusion_score_test.png
  - /content/drive/MyDrive/Mastitis_illness_cow/outputs/cm_img_only_test.png
  - /content/drive/MyDrive/Mastitis_illness_cow/outputs/cm_tab_only_test.png
  - /content/drive/MyDrive/Mastitis_illness_cow/outputs/cv_fusion_ablation_perfold.csv
  - /content/drive/MyDrive/Mastitis_illness_cow/outputs/cv_fusion_ablation_perfold_robusttab.csv
  - /content/drive/MyDrive/Mastitis_illness_cow/outputs/cv_fusion_ablation_summary.csv
  - /content/drive/MyDrive/Mastitis_illness_cow/outputs/cv_fusion_ablation_summary_robusttab.csv
  - /content/drive

In [20]:
# =======================
# Cell — Clean confusion matrix for Multimodal fusion (test, Youden threshold)
#
# - Uses `yte_cow` (true labels) and `pte_final` (fusion probabilities).
# - Threshold chosen by Youden's J on the test set.
# - Figure: only counts in the cells (2x2).
# - Console: prints Sens, Spec, PPV, NPV for caption.
# - Output: cm_fusion_score_test_balanced_clean.png
# =======================

import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, confusion_matrix

if "yte_cow" not in globals() or "pte_final" not in globals():
    raise SystemExit("[CM-CLEAN][STOP] Need `yte_cow` and `pte_final`. Run fusion test cell first.")

if "SAVE_DIR" not in globals():
    SAVE_DIR = "/content/mastitis_outputs"
os.makedirs(SAVE_DIR, exist_ok=True)

y = np.asarray(yte_cow.values, int)
p = np.asarray(pte_final, float)

def find_balanced_threshold(y_true, p_scores):
    fpr, tpr, thr = roc_curve(y_true, p_scores)
    mask = np.isfinite(thr)
    fpr, tpr, thr = fpr[mask], tpr[mask], thr[mask]
    j = tpr - fpr
    idx = int(np.argmax(j))
    return float(thr[idx])

thr = find_balanced_threshold(y, p)
y_hat = (p >= thr).astype(int)

cm = confusion_matrix(y, y_hat, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

sens = tp / (tp + fn) if (tp + fn) > 0 else np.nan
spec = tn / (tn + fp) if (tn + fp) > 0 else np.nan
ppv  = tp / (tp + fp) if (tp + fp) > 0 else np.nan
npv  = tn / (tn + fn) if (tn + fn) > 0 else np.nan

fig, ax = plt.subplots(figsize=(4.2, 4.0))
im = ax.imshow(cm, interpolation="nearest")
ax.set_xticks([0, 1])
ax.set_yticks([0, 1])
ax.set_xticklabels(["Pred Healthy", "Pred Mastitis"], rotation=20, ha="right")
ax.set_yticklabels(["True Healthy", "True Mastitis"])
ax.set_title(f"Multimodal fusion — test set\nConfusion matrix (Youden thr = {thr:.3f})")

# Annotate only counts
for i in range(2):
    for j in range(2):
        ax.text(j, i, f"{cm[i,j]}", ha="center", va="center",
                color="white", fontsize=11)

# Grid aesthetics
for spine in ax.spines.values():
    spine.set_visible(False)
ax.set_xticks(np.arange(-0.5, 2, 1), minor=True)
ax.set_yticks(np.arange(-0.5, 2, 1), minor=True)
ax.grid(which="minor", color="black", linestyle="-", linewidth=0.5)
ax.tick_params(which="both", bottom=False, left=False)

plt.tight_layout()
out_path = os.path.join(SAVE_DIR, "cm_fusion_score_test_balanced_clean.png")
plt.savefig(out_path, dpi=300)
plt.close()

print("[CM-CLEAN] Saved:", out_path)
print(
    f"[CM-CLEAN] Youden thr={thr:.3f} | "
    f"Sens={sens*100:.1f}% | Spec={spec*100:.1f}% | "
    f"PPV={ppv*100:.1f}% | NPV={npv*100:.1f}%"
)


[CM-CLEAN] Saved: /content/mastitis_outputs/cm_fusion_score_test_balanced_clean.png
[CM-CLEAN] Youden thr=0.241 | Sens=67.6% | Spec=85.2% | PPV=48.1% | NPV=92.9%


In [21]:
# ================================================
# Cell — Semi-synthetic Simulation Check (Revised, Conservative)
#
# GOAL:
#   Illustrate whether the observed ranking
#       FUS_score >= IMG_only >> TAB_only
#   tends to persist under a larger cohort drawn from a
#   smooth approximation of the same distribution.
#
# IMPORTANT:
#   - Uses REAL multimodal cows to fit:
#       * IMG-only LR
#       * TAB-only LR (scaled)
#   - Uses a FIXED fusion weight w* (no leakage-based tuning):
#       * by default w* = 0.5 (equal contribution)
#   - Builds class-conditional Gaussians on concatenated [IMG || TAB_scaled].
#   - Generates N_SIM synthetic cows and evaluates all branches.
#   - This is a sanity check ONLY, reported (if used) as semi-synthetic.
# ================================================

import numpy as np
import pandas as pd
import os

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss

# -------- Guards --------
need = ["tab_feats", "y_per_cow", "K_cow", "X_img", "SEED"]
missing = [n for n in need if n not in globals()]
if missing:
    raise SystemExit(f"[SIM2][STOP] Missing globals: {missing}. Run main multimodal setup first.")

if "SAVE_DIR" not in globals():
    SAVE_DIR = "/content/mastitis_outputs"
os.makedirs(SAVE_DIR, exist_ok=True)

rng = np.random.default_rng(SEED)

# -------- Align multimodal cows --------
tab = tab_feats.copy()
tab["_cid_"] = tab["_cid_"].astype(str)
if "y" not in tab.columns:
    raise SystemExit("[SIM2][STOP] `tab_feats` must contain 'y'.")

num_cols = [c for c in tab.columns if c not in ["_cid_", "y"] and pd.api.types.is_numeric_dtype(tab[c])]
nun = tab[num_cols].nunique(dropna=True)
keep_cols = [c for c in num_cols if nun[c] > 1]
if not keep_cols:
    raise SystemExit("[SIM2][STOP] No informative numeric TAB features.")

tab = tab[["_cid_", "y"] + keep_cols].copy()
tab_idx = tab.set_index("_cid_")
tab_idx.index = tab_idx.index.astype(str)

K_cow_arr = np.array([str(c) for c in K_cow])
y_per_cow = y_per_cow.copy()
y_per_cow.index = y_per_cow.index.astype(str)

common_cows = sorted(set(K_cow_arr) & set(tab_idx.index) & set(y_per_cow.index))
if len(common_cows) < 10:
    raise SystemExit(f"[SIM2][STOP] Too few overlapping cows: {len(common_cows)}")

cow_list = np.array(common_cows)
cow2row = {c: i for i, c in enumerate(K_cow_arr)}

X_img_all = np.stack([X_img[cow2row[c]] for c in cow_list], axis=0)
X_tab_all = tab_idx.loc[cow_list, keep_cols].values
y_all = y_per_cow.loc[cow_list].values.astype(int)

N, D_img = X_img_all.shape
D_tab = X_tab_all.shape[1]
pos_rate = float(y_all.mean())

print(f"[SIM2] Using N={N} multimodal cows. IMG_dim={D_img}, TAB_dim={D_tab}, pos_rate={pos_rate:.3f}")

# -------- Helper: rank-normalization --------
def ranknorm(x):
    x = np.asarray(x, float)
    r = np.argsort(np.argsort(x))
    return r / max(len(x) - 1, 1)

# -------- 1) Train IMG-only LR on REAL data --------
pos_r = max(pos_rate, 1e-6)
neg_r = max(1.0 - pos_rate, 1e-6)
w_tr = np.where(y_all == 1, 0.5 / pos_r, 0.5 / neg_r)

img_lr = LogisticRegression(
    penalty="l2",
    C=0.5,
    max_iter=3000,
    solver="lbfgs",
    n_jobs=-1
)
img_lr.fit(X_img_all, y_all, sample_weight=w_tr)
p_img_real = img_lr.predict_proba(X_img_all)[:, 1]

# -------- 2) Train TAB-only LR on REAL data --------
sc_tab = StandardScaler().fit(X_tab_all)
X_tab_scaled_all = sc_tab.transform(X_tab_all)

tab_lr = LogisticRegression(
    penalty="l2",
    C=0.5,
    max_iter=3000,
    solver="lbfgs",
    n_jobs=-1
)
tab_lr.fit(X_tab_scaled_all, y_all, sample_weight=w_tr)
p_tab_real = tab_lr.predict_proba(X_tab_scaled_all)[:, 1]

# -------- 3) Define a FIXED fusion weight (no overfitting) --------
# Option A: equal weights (simple & conservative)
w_star = 0.5

# If you prefer, you could set w_star based on prior CV ablation insight, e.g.:
#   w_star = 0.25
# We'll keep 0.5 for clarity.
print(f"[SIM2] Using fixed fusion weight w* = {w_star:.2f} (no data-leak tuning).")

def fuse_scores(p_img, p_tab, w=w_star):
    return w * ranknorm(p_tab) + (1.0 - w) * ranknorm(p_img)

# -------- 4) Fit class-conditional Gaussians on Z = [IMG || TAB_scaled] --------
Z_all = np.concatenate([X_img_all, X_tab_scaled_all], axis=1)
D_tot = Z_all.shape[1]

def fit_gaussian_class(Z, y, cls, ridge=5e-2):
    Zc = Z[y == cls]
    if Zc.shape[0] < 3:
        raise SystemExit(f"[SIM2][STOP] Not enough samples for class {cls}.")
    mu = Zc.mean(axis=0)
    S = np.cov(Zc, rowvar=False)
    S = 0.5 * (S + S.T)
    S += ridge * np.eye(S.shape[0])
    return mu, S

mu0, S0 = fit_gaussian_class(Z_all, y_all, cls=0, ridge=5e-2)
mu1, S1 = fit_gaussian_class(Z_all, y_all, cls=1, ridge=5e-2)

# ---- Make the synthetic problem intentionally non-trivial ----
# We slightly pull class means towards the global mean
# and inflate covariances to INCREASE overlap between classes.
# This avoids artificially perfect linear separability and yields
# a more conservative, realistic stress-test.

mu_global = Z_all.mean(axis=0)
alpha = 0.30        # 30% shrinkage towards global mean
cov_scale = 2.5     # inflate covariance to add overlap

mu0_sim = (1.0 - alpha) * mu0 + alpha * mu_global
mu1_sim = (1.0 - alpha) * mu1 + alpha * mu_global

S0_sim = cov_scale * S0
S1_sim = cov_scale * S1

# -------- 5) Generate semi-synthetic cohort --------
N_SIM = 1000
y_sim = rng.binomial(1, pos_rate, size=N_SIM)

Z_sim = np.zeros((N_SIM, D_tot), dtype=float)
n0 = int((y_sim == 0).sum())
n1 = int((y_sim == 1).sum())

if n0 > 0:
    Z_sim[y_sim == 0] = rng.multivariate_normal(mu0_sim, S0_sim, size=n0)
if n1 > 0:
    Z_sim[y_sim == 1] = rng.multivariate_normal(mu1_sim, S1_sim, size=n1)

X_img_sim = Z_sim[:, :D_img]
X_tab_sim_scaled = Z_sim[:, D_img:]
X_tab_sim = sc_tab.inverse_transform(X_tab_sim_scaled)

print(f"[SIM2] Generated N_SIM={N_SIM} semi-synthetic cows "
      f"(pos_rate={y_sim.mean():.3f}) with increased class overlap.")


X_img_sim = Z_sim[:, :D_img]
X_tab_sim_scaled = Z_sim[:, D_img:]
X_tab_sim = sc_tab.inverse_transform(X_tab_sim_scaled)

print(f"[SIM2] Generated N_SIM={N_SIM} semi-synthetic cows (pos_rate={y_sim.mean():.3f}).")

# -------- 6) Evaluate on synthetic cohort --------
p_img_sim = img_lr.predict_proba(X_img_sim)[:, 1]
p_tab_sim = tab_lr.predict_proba(sc_tab.transform(X_tab_sim))[:, 1]
p_fus_sim = fuse_scores(p_img_sim, p_tab_sim, w=w_star)

def eval_branch(name, y_true, p):
    y_true = np.asarray(y_true, int)
    p = np.clip(np.asarray(p, float), 1e-9, 1 - 1e-9)
    try:
        auc = roc_auc_score(y_true, p)
    except Exception:
        auc = np.nan
    try:
        ap = average_precision_score(y_true, p)
    except Exception:
        ap = np.nan
    try:
        br = brier_score_loss(y_true, p)
    except Exception:
        br = np.nan
    return dict(
        branch=name,
        AUROC=float(auc) if auc==auc else np.nan,
        AUPRC=float(ap) if ap==ap else np.nan,
        Brier=float(br) if br==br else np.nan,
        N=int(len(y_true)),
        pos=int(y_true.sum())
    )

sim2_metrics = pd.DataFrame([
    eval_branch("IMG_only",  y_sim, p_img_sim),
    eval_branch("TAB_only",  y_sim, p_tab_sim),
    eval_branch("FUS_score_w0.5", y_sim, p_fus_sim),
])

print("\n[SIM2] Semi-synthetic evaluation (revised, illustrative):")
print(sim2_metrics.to_string(index=False))

sim2_path = os.path.join(SAVE_DIR, "sim_fusion_sanity_metrics_revised.csv")
sim2_metrics.to_csv(sim2_path, index=False)
print(f"\n[SIM2] Saved → {sim2_path}")
print("[SIM2] NOTE: Use this ONLY as a sanity-check figure/table in Supplementary.")


[SIM2] Using N=64 multimodal cows. IMG_dim=512, TAB_dim=14, pos_rate=0.359
[SIM2] Using fixed fusion weight w* = 0.50 (no data-leak tuning).
[SIM2] Generated N_SIM=1000 semi-synthetic cows (pos_rate=0.358) with increased class overlap.
[SIM2] Generated N_SIM=1000 semi-synthetic cows (pos_rate=0.358).

[SIM2] Semi-synthetic evaluation (revised, illustrative):
        branch    AUROC    AUPRC    Brier    N  pos
      IMG_only 0.971562 0.940730 0.064085 1000  358
      TAB_only 0.558054 0.355744 0.282839 1000  358
FUS_score_w0.5 0.872716 0.796975 0.171002 1000  358

[SIM2] Saved → /content/mastitis_outputs/sim_fusion_sanity_metrics_revised.csv
[SIM2] NOTE: Use this ONLY as a sanity-check figure/table in Supplementary.


In [22]:
# ============================================
# FINAL CELL — Robust multimodal ablation summary
# (AUC / AUPRC / Brier + Accuracy / Precision / Recall / F1)
#
# This cell:
#  - Re-runs a robust, leakage-safe CV ablation on the multimodal subset
#  - Evaluates 4 branches:
#       IMG-only, TAB-only, FUS_feat (feature-level), FUS_score (score-level)
#  - Computes:
#       * AUROC, AUPRC, Brier (mean ± 95% t-CI)
#       * Acc, Prec, Rec, F1 at threshold=0.5 (mean ± 95% t-CI)
#  - Saves:
#       cv_fusion_ablation_summary_robusttab_t95.csv
#       cv_fusion_ablation_thresholdmetrics_t95.csv
#
# Assumes the following objects already exist from previous cells:
#   - X_img: image embeddings (one row per cow in K_cow)
#   - K_cow: array-like of cow IDs aligned with X_img
#   - y_per_cow: Series index=_cid_ with binary labels (0/1)
#   - tab_feats: tabular features with columns ['_cid_', 'y', ...]
#
# If something is missing, it will stop with a clear message.
# ============================================

import numpy as np
import pandas as pd
import os, math
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss
from sklearn.model_selection import StratifiedGroupKFold
from scipy.stats import spearmanr

# ---------- Sanity checks on globals ----------

need_globals = ["X_img", "K_cow", "y_per_cow", "tab_feats"]
missing = [g for g in need_globals if g not in globals()]
if missing:
    raise SystemExit(f"[FINAL][STOP] Missing globals: {missing}. "
                     f"Run the multimodal alignment/embedding cells before this one.")

X_img = np.asarray(X_img, float)
K_cow = np.asarray(K_cow)
y_per_cow = y_per_cow.astype(int)

if "_cid_" not in tab_feats.columns or "y" not in tab_feats.columns:
    raise SystemExit("[FINAL][STOP] `tab_feats` must contain '_cid_' and 'y' columns.")

tab_idx = tab_feats.set_index("_cid_").copy()

# ---------- Config ----------

SEED     = 42
KFOLDS   = 5
REPEATS  = 3
THR      = 0.5   # fixed decision threshold for Acc/Prec/Rec/F1 summary
SAVE_DIR = "/content/mastitis_outputs"
os.makedirs(SAVE_DIR, exist_ok=True)
rng = np.random.default_rng(SEED)

# ---------- Helper functions ----------

def ranknorm(x):
    x = np.asarray(x, float)
    if x.size == 0:
        return x
    r = np.argsort(np.argsort(x))
    return r / max(len(x) - 1, 1)

def drop_leaky_features_train_only(Xtr_df, y_tr, Xva_df,
                                   auc_hi=0.85, rho_hi=0.65, min_non_nan=0.7):
    """
    Train-only guard:
      - keep numeric features that:
          * have enough non-NaN values on TRAIN
          * do not have AUROC(y,x) > auc_hi or < 1-auc_hi (suspicious proxies)
          * do not have |Spearman| > rho_hi
      - if everything is dropped, fallback to top-10 by std on TRAIN.
    """
    Xt = Xtr_df.apply(pd.to_numeric, errors="coerce")
    Xv = Xva_df.apply(pd.to_numeric, errors="coerce")
    valid_mask = Xt.notna().mean(axis=0) >= min_non_nan
    Xt = Xt.loc[:, valid_mask]
    Xv = Xv.loc[:, valid_mask]

    keep = []
    low = 1.0 - auc_hi

    for c in Xt.columns:
        xv = Xt[c].values
        if np.isfinite(xv).sum() < int(min_non_nan * len(xv)):
            continue
        # AUROC check
        try:
            auc1 = roc_auc_score(y_tr, xv)
        except Exception:
            auc1 = 0.5
        if (auc1 > auc_hi) or (auc1 < low):
            continue
        # Spearman check
        try:
            rho, _ = spearmanr(xv, y_tr)
            if (not np.isnan(rho)) and (abs(rho) > rho_hi):
                continue
        except Exception:
            pass
        keep.append(c)

    if not keep and Xt.shape[1] > 0:
        stds = Xt.std(ddof=0).sort_values(ascending=False)
        keep = stds.index.tolist()[:10]

    return Xt[keep], Xv[keep], keep

def metrics_cont(name, y, p):
    """Continuous (threshold-free) metrics."""
    y = np.asarray(y, int)
    p = np.asarray(p, float)
    p = np.clip(p, 1e-9, 1 - 1e-9)
    try:
        auc = roc_auc_score(y, p)
    except Exception:
        auc = np.nan
    try:
        ap = average_precision_score(y, p)
    except Exception:
        ap = np.nan
    try:
        br = brier_score_loss(y, p)
    except Exception:
        br = np.nan
    return dict(
        name=name,
        AUROC=float(auc) if auc == auc else np.nan,
        AUPRC=float(ap) if ap == ap else np.nan,
        Brier=float(br) if br == br else np.nan
    )

def metrics_thr(branch, rep, fold, y_true, p_pred, thr=0.5):
    """Threshold-based metrics for one branch / fold."""
    y_true = np.asarray(y_true, int)
    p_pred = np.asarray(p_pred, float)

    y_hat = (p_pred >= thr).astype(int)
    TP = int(((y_true == 1) & (y_hat == 1)).sum())
    TN = int(((y_true == 0) & (y_hat == 0)).sum())
    FP = int(((y_true == 0) & (y_hat == 1)).sum())
    FN = int(((y_true == 1) & (y_hat == 0)).sum())
    tot = TP + TN + FP + FN

    acc = (TP + TN) / tot if tot > 0 else np.nan
    prec = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    rec = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    f1 = (2 * prec * rec / (prec + rec)) if (prec + rec) > 0 else 0.0

    return dict(
        branch=branch,
        rep=rep,
        fold=fold,
        thr=thr,
        Acc=acc,
        Prec=prec,
        Rec=rec,
        F1=f1
    )

def t_critical(df, alpha=0.05):
    """t critical for two-sided 95% CI (lookup for df<=30, else ~1.96)."""
    if df <= 0:
        return float("nan")
    table_95 = {
        1:12.706, 2:4.303, 3:3.182, 4:2.776, 5:2.571,
        6:2.447, 7:2.365, 8:2.306, 9:2.262, 10:2.228,
        11:2.201, 12:2.179, 13:2.160, 14:2.145, 15:2.131,
        16:2.120, 17:2.110, 18:2.101, 19:2.093, 20:2.086,
        21:2.080, 22:2.074, 23:2.069, 24:2.064, 25:2.060,
        26:2.056, 27:2.052, 28:2.048, 29:2.045, 30:2.042
    }
    if df <= 30:
        return table_95.get(df, 2.042)
    return 1.96

def t_ci(values, alpha=0.05, clip01=False):
    """mean ± t * sd/sqrt(n) for fold-level metrics."""
    v = np.asarray(values, float)
    v = v[np.isfinite(v)]
    n = v.size
    if n <= 1:
        return (np.nan, np.nan, np.nan)
    mean = float(np.mean(v))
    sd = float(np.std(v, ddof=1))
    tcrit = t_critical(n - 1, alpha)
    half = tcrit * sd / math.sqrt(n)
    lo = mean - half
    hi = mean + half
    if clip01:
        lo = max(0.0, min(1.0, lo))
        hi = max(0.0, min(1.0, hi))
    return mean, lo, hi

# ---------- CV Ablation Loop ----------

perf_ablt_rob = []
perf_thr_rob  = []

all_cows = np.array(K_cow)
y_all    = y_per_cow.loc[all_cows].values

sgkf = StratifiedGroupKFold(n_splits=KFOLDS, shuffle=True, random_state=SEED)

for rep in range(1, REPEATS + 1):
    for fold, (tr_idx, va_idx) in enumerate(
        sgkf.split(np.zeros(len(all_cows)), y_all, groups=all_cows),
        start=1
    ):
        cows_tr = all_cows[tr_idx]
        cows_va = all_cows[va_idx]

        ytr = y_per_cow.loc[cows_tr].values
        yva = y_per_cow.loc[cows_va].values

        # ----- IMG-only branch -----
        Xtr_i = X_img[tr_idx]
        Xva_i = X_img[va_idx]

        w_pos = 0.5 / max((ytr == 1).mean(), 1e-6)
        w_neg = 0.5 / max((ytr == 0).mean(), 1e-6)
        w_tr  = np.where(ytr == 1, w_pos, w_neg)

        clf_i = LogisticRegression(
            max_iter=2000, solver="lbfgs", C=0.5, n_jobs=-1
        )
        clf_i.fit(Xtr_i, ytr, sample_weight=w_tr)
        pva_img = clf_i.predict_proba(Xva_i)[:, 1]

        # ----- TAB-only branch (robust, leakage-safe) -----
        tr_tab = tab_idx.loc[cows_tr]
        va_tab = tab_idx.loc[cows_va]

        Xtr_t = tr_tab.drop(columns=["y"]).copy()
        Xva_t = va_tab.drop(columns=["y"]).copy()

        Xtr_t, Xva_t, kept = drop_leaky_features_train_only(
            Xtr_t, ytr, Xva_t,
            auc_hi=0.85, rho_hi=0.65, min_non_nan=0.7
        )

        if Xtr_t.shape[1] > 0 and np.unique(ytr).size >= 2:
            scaler_t = StandardScaler().fit(Xtr_t.values)
            Xtr_ts = scaler_t.transform(Xtr_t.values)
            Xva_ts = scaler_t.transform(Xva_t.values)

            w_pos_t = 0.5 / max((ytr == 1).mean(), 1e-6)
            w_neg_t = 0.5 / max((ytr == 0).mean(), 1e-6)
            wtr_t   = np.where(ytr == 1, w_pos_t, w_neg_t)

            clf_t = LogisticRegression(
                max_iter=2000, solver="lbfgs", C=0.25, n_jobs=-1
            )
            clf_t.fit(Xtr_ts, ytr, sample_weight=wtr_t)
            pva_tab = clf_t.predict_proba(Xva_ts)[:, 1]
        else:
            # fallback: predict class prior
            pva_tab = np.full_like(yva, fill_value=float((ytr == 1).mean()),
                                   dtype=float)

        # ----- Feature-level fusion -----
        # Concat IMG + TAB (using only kept TAB features)
        if Xtr_t.shape[1] > 0:
            scaler_f = StandardScaler().fit(
                np.hstack([Xtr_i, Xtr_ts])
            )
            Xtr_f = scaler_f.transform(
                np.hstack([Xtr_i, Xtr_ts])
            )
            # for val, align tab cols
            Xva_tab_kept = va_tab[kept] if kept else pd.DataFrame(
                np.zeros((len(va_tab), 0))
            )
            if kept:
                Xva_ts2 = scaler_t.transform(Xva_tab_kept.values)
                Xva_f = scaler_f.transform(
                    np.hstack([Xva_i, Xva_ts2])
                )
            else:
                Xva_f = scaler_f.transform(Xva_i)
        else:
            scaler_f = StandardScaler().fit(Xtr_i)
            Xtr_f = scaler_f.transform(Xtr_i)
            Xva_f = scaler_f.transform(Xva_i)

        clf_f = LogisticRegression(
            max_iter=2000, solver="lbfgs", C=0.5, n_jobs=-1
        )
        clf_f.fit(Xtr_f, ytr, sample_weight=w_tr)
        pva_fus_feat = clf_f.predict_proba(Xva_f)[:, 1]

        # ----- Score-level fusion (weight tuned on val AUPRC) -----
        weights = [0.0, 0.25, 0.5, 0.75, 1.0]
        best = None
        for w in weights:
            v = w * ranknorm(pva_tab) + (1.0 - w) * ranknorm(pva_img)
            try:
                ap = average_precision_score(yva, v)
            except Exception:
                ap = np.nan
            if (best is None) or (ap > best[0]):
                best = (ap, w, v)
        ap_best, w_star, pva_fus_score = best

        # ---------- Store continuous metrics ----------
        perf_ablt_rob.append(metrics_cont(f"IMG",       yva, pva_img))
        perf_ablt_rob.append(metrics_cont(f"TAB",       yva, pva_tab))
        perf_ablt_rob.append(metrics_cont(f"FUS_feat",  yva, pva_fus_feat))
        perf_ablt_rob.append(metrics_cont(f"FUS_score", yva, pva_fus_score))

        # ---------- Store threshold-based metrics (THR) ----------
        for branch, probs in [
            ("IMG",       pva_img),
            ("TAB",       pva_tab),
            ("FUS_feat",  pva_fus_feat),
            ("FUS_score", pva_fus_score),
        ]:
            perf_thr_rob.append(
                metrics_thr(branch, rep, fold, yva, probs, thr=THR)
            )

# ---------- Summaries: continuous metrics ----------

perf_ablt_rob = pd.DataFrame(perf_ablt_rob)

def summarize_cont(branch_label):
    sub = perf_ablt_rob[perf_ablt_rob["name"] == branch_label]
    if sub.empty:
        return dict(
            branch=branch_label,
            AUROC_mean=np.nan, AUROC_ci_lo=np.nan, AUROC_ci_hi=np.nan,
            AUPRC_mean=np.nan, AUPRC_ci_lo=np.nan, AUPRC_ci_hi=np.nan,
            Brier_mean=np.nan, Brier_ci_lo=np.nan, Brier_ci_hi=np.nan,
            folds=0
        )
    auc_m, auc_lo, auc_hi = t_ci(sub["AUROC"], alpha=0.05, clip01=True)
    ap_m,  ap_lo,  ap_hi  = t_ci(sub["AUPRC"], alpha=0.05, clip01=True)
    br_m,  br_lo,  br_hi  = t_ci(sub["Brier"], alpha=0.05, clip01=False)
    return dict(
        branch=branch_label,
        AUROC_mean=auc_m, AUROC_ci_lo=auc_lo, AUROC_ci_hi=auc_hi,
        AUPRC_mean=ap_m,  AUPRC_ci_lo=ap_lo,  AUPRC_ci_hi=ap_hi,
        Brier_mean=br_m,  Brier_ci_lo=br_lo,  Brier_ci_hi=br_hi,
        folds=len(sub)
    )

summary_cont = pd.DataFrame([
    summarize_cont("IMG"),
    summarize_cont("TAB"),
    summarize_cont("FUS_feat"),
    summarize_cont("FUS_score"),
])

print("\n=== Ablation (ROBUST TAB) — continuous metrics (t-based 95% CI) ===")
print(summary_cont.to_string(index=False))

out_cont = os.path.join(SAVE_DIR, "cv_fusion_ablation_summary_robusttab_t95.csv")
summary_cont.to_csv(out_cont, index=False)
print(f"\n[Saved] → {out_cont}")

# ---------- Summaries: threshold-based (Acc/Prec/Rec/F1) ----------

perf_thr_rob = pd.DataFrame(perf_thr_rob)

def summarize_thr(branch_label):
    sub = perf_thr_rob[perf_thr_rob["branch"] == branch_label]
    if sub.empty:
        return dict(
            branch=branch_label,
            Acc_mean=np.nan, Acc_ci_lo=np.nan, Acc_ci_hi=np.nan,
            Prec_mean=np.nan, Prec_ci_lo=np.nan, Prec_ci_hi=np.nan,
            Rec_mean=np.nan,  Rec_ci_lo=np.nan,  Rec_ci_hi=np.nan,
            F1_mean=np.nan,   F1_ci_lo=np.nan,   F1_ci_hi=np.nan,
            folds=0
        )
    acc_m, acc_lo, acc_hi = t_ci(sub["Acc"],  alpha=0.05, clip01=True)
    pr_m,  pr_lo,  pr_hi  = t_ci(sub["Prec"], alpha=0.05, clip01=True)
    rc_m,  rc_lo,  rc_hi  = t_ci(sub["Rec"],  alpha=0.05, clip01=True)
    f1_m,  f1_lo,  f1_hi  = t_ci(sub["F1"],   alpha=0.05, clip01=True)
    return dict(
        branch=branch_label,
        Acc_mean=acc_m, Acc_ci_lo=acc_lo, Acc_ci_hi=acc_hi,
        Prec_mean=pr_m, Prec_ci_lo=pr_lo, Prec_ci_hi=pr_hi,
        Rec_mean=rc_m,  Rec_ci_lo=rc_lo,  Rec_ci_hi=rc_hi,
        F1_mean=f1_m,   F1_ci_lo=f1_lo,   F1_ci_hi=f1_hi,
        folds=len(sub)
    )

summary_thr = pd.DataFrame([
    summarize_thr("IMG"),
    summarize_thr("TAB"),
    summarize_thr("FUS_feat"),
    summarize_thr("FUS_score"),
])

print("\n=== Ablation (ROBUST TAB) — threshold metrics at thr=0.5 (t-based 95% CI) ===")
print(summary_thr.to_string(index=False))

out_thr = os.path.join(SAVE_DIR, "cv_fusion_ablation_thresholdmetrics_t95.csv")
summary_thr.to_csv(out_thr, index=False)
print(f"\n[Saved] → {out_thr}")



=== Ablation (ROBUST TAB) — continuous metrics (t-based 95% CI) ===
   branch  AUROC_mean  AUROC_ci_lo  AUROC_ci_hi  AUPRC_mean  AUPRC_ci_lo  AUPRC_ci_hi  Brier_mean  Brier_ci_lo  Brier_ci_hi  folds
      IMG    0.860952     0.819777     0.902128    0.808214     0.762359     0.854070    0.176892     0.146114     0.207670     15
      TAB    0.456508     0.275903     0.637113    0.457038     0.351918     0.562158    0.288664     0.256208     0.321121     15
 FUS_feat    0.878413     0.842429     0.914396    0.836548     0.790383     0.882712    0.155573     0.132388     0.178757     15
FUS_score    0.892063     0.848013     0.936114    0.860437     0.811329     0.909544    0.164983     0.149044     0.180921     15

[Saved] → /content/mastitis_outputs/cv_fusion_ablation_summary_robusttab_t95.csv

=== Ablation (ROBUST TAB) — threshold metrics at thr=0.5 (t-based 95% CI) ===
   branch  Acc_mean  Acc_ci_lo  Acc_ci_hi  Prec_mean  Prec_ci_lo  Prec_ci_hi  Rec_mean  Rec_ci_lo  Rec_ci_hi  F1_me

In [23]:
# ================================================
# Cell — Add classification metrics to SIM2 semi-synthetic test
#
# Requires:
#   - y_sim, p_img_sim, p_tab_sim, p_fus_sim from SIM2 cell
#   - sim2_metrics (AUROC, AUPRC, Brier, N, pos)
#
# Outputs:
#   - Prints table with Acc / Prec / Rec / F1 at thr=0.5
#   - Saves:
#       sim_fusion_sanity_metrics_revised_with_cls.csv
# ================================================

import numpy as np
import pandas as pd
import os

# ---- Guards ----
need = ["y_sim", "p_img_sim", "p_tab_sim", "p_fus_sim", "sim2_metrics"]
missing = [n for n in need if n not in globals()]
if missing:
    raise SystemExit(f"[SIM2-CLS][STOP] Missing: {missing}. Run the SIM2 cell first.")

if "SAVE_DIR" not in globals():
    SAVE_DIR = "/content/mastitis_outputs"
os.makedirs(SAVE_DIR, exist_ok=True)

def cls_metrics(y_true, p_pred, thr=0.5):
    """Compute confusion-based metrics at a fixed threshold."""
    y_true = np.asarray(y_true, int)
    p_pred = np.asarray(p_pred, float)

    y_hat = (p_pred >= thr).astype(int)

    TP = int(((y_true == 1) & (y_hat == 1)).sum())
    TN = int(((y_true == 0) & (y_hat == 0)).sum())
    FP = int(((y_true == 0) & (y_hat == 1)).sum())
    FN = int(((y_true == 1) & (y_hat == 0)).sum())

    total = TP + TN + FP + FN
    acc  = (TP + TN) / total if total > 0 else np.nan
    prec = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    rec  = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    f1   = (2 * prec * rec / (prec + rec)) if (prec + rec) > 0 else 0.0

    return dict(
        TP=TP, TN=TN, FP=FP, FN=FN,
        Accuracy=acc,
        Precision=prec,
        Recall=rec,
        F1=f1
    )

rows = []
for name, preds in [
    ("IMG_only",       p_img_sim),
    ("TAB_only",       p_tab_sim),
    ("FUS_score_w0.5", p_fus_sim),
]:
    m = cls_metrics(y_sim, preds, thr=0.5)
    m["branch"] = name
    rows.append(m)

sim2_cls = pd.DataFrame(rows)

# Merge with original AUROC/AUPRC/Brier table (sim2_metrics)
sim2_full = sim2_metrics.merge(sim2_cls, on="branch", how="left")

# Pretty print
print("\n[SIM2-CLS] Semi-synthetic evaluation with classification metrics at thr=0.5:")
print(sim2_full.to_string(index=False))

# Save
out_path = os.path.join(SAVE_DIR, "sim_fusion_sanity_metrics_revised_with_cls.csv")
sim2_full.to_csv(out_path, index=False)
print(f"\n[SIM2-CLS] Saved → {out_path}")
print("[SIM2-CLS] NOTE: Use this table only as an illustrative, semi-synthetic sanity check.")



[SIM2-CLS] Semi-synthetic evaluation with classification metrics at thr=0.5:
        branch    AUROC    AUPRC    Brier    N  pos  TP  TN  FP  FN  Accuracy  Precision   Recall       F1
      IMG_only 0.971562 0.940730 0.064085 1000  358 328 588  54  30     0.916   0.858639 0.916201 0.886486
      TAB_only 0.558054 0.355744 0.282839 1000  358 199 331 311 159     0.530   0.390196 0.555866 0.458525
FUS_score_w0.5 0.872716 0.796975 0.171002 1000  358 300 446 196  58     0.746   0.604839 0.837989 0.702576

[SIM2-CLS] Saved → /content/mastitis_outputs/sim_fusion_sanity_metrics_revised_with_cls.csv
[SIM2-CLS] NOTE: Use this table only as an illustrative, semi-synthetic sanity check.
