In [5]:
# antibiotic_direction_pipeline.py
# Unsupervised (KMeans) -> pseudo-labels (increase/decrease) -> Supervised classifier
# Works with a wide MIMIC-style table that includes DOSE_VAL_RX and abx one-hot flags.
# Requires: pandas, numpy, scikit-learn, joblib

import json
from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, f1_score, precision_recall_fscore_support,
    classification_report, confusion_matrix, silhouette_score
)
from sklearn.model_selection import train_test_split
import joblib

# -------------------------
# CONFIG
# -------------------------
DATA_PATH = "output_with_is_dead.csv"   # <- change if needed
RANDOM_STATE = 1337
DEFAULT_K = 4
TRY_PICK_K = True          # set False to force DEFAULT_K
K_CANDIDATES = [3, 4, 5, 6]
SIL_SAMPLE_N = 4000        # sample size for silhouette (to keep it fast)
MARGIN_PCT = 0.00          # 0.00 = label everyone; e.g., 0.05 ignores ±5% zone around median

ID_COLS_CANON = ["SUBJECT_ID", "HADM_ID", "ICUSTAY_ID"]
OUTCOME_COLS = ["is_dead"]   # always exclude from features
CANDIDATE_DOSE_COLS = [
    "dose_val_rx", "dose", "dosage", "dose_val", "abx_dose"
]
OPTIONAL_CATEGORICAL = ["Antibiotic_category"]  # one-hot encode if present

MODEL_PATH = "antibiotic_direction_pipeline.joblib"
SAMPLE_PRED_PATH = "sample_predictions_first200.csv"

# -------------------------
# UTILITIES
# -------------------------
def find_dose_col(df: pd.DataFrame) -> str:
    lower = {c.lower(): c for c in df.columns}
    for key in CANDIDATE_DOSE_COLS:
        if key in lower:
            return lower[key]
    raise ValueError(
        f"Could not find dose column; looked for any of {CANDIDATE_DOSE_COLS} "
        f"in columns={list(df.columns)[:10]}..."
    )

def build_preprocessor(df: pd.DataFrame, dose_col: str):
    # IDs & outcomes to drop from features
    id_cols = [c for c in df.columns if c.upper() in set(ID_COLS_CANON)]
    drop_cols = set(id_cols + OUTCOME_COLS + [dose_col])

    # numeric vs categorical features
    cat_cols = [c for c in OPTIONAL_CATEGORICAL if c in df.columns]
    num_cols = [c for c in df.select_dtypes(include=[np.number]).columns
                if c not in drop_cols and c not in cat_cols]

    # numeric pipeline
    numeric_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ])

    # categorical pipeline (rare in this table, but include if 'Antibiotic_category' exists)
    categorical_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore"))
    ])

    preprocess = ColumnTransformer(
        transformers=[
            ("num", numeric_pipeline, num_cols),
            ("cat", categorical_pipeline, cat_cols),
        ],
        remainder="drop"
    )

    feature_cols = num_cols + cat_cols
    return preprocess, feature_cols, id_cols

def choose_k_by_silhouette(X_mat, k_values, sample_n=SIL_SAMPLE_N, random_state=RANDOM_STATE):
    # Sample to speed up silhouette computation
    if X_mat.shape[0] > sample_n:
        rng = np.random.default_rng(random_state)
        idx = rng.choice(X_mat.shape[0], size=sample_n, replace=False)
        X_eval = X_mat[idx]
    else:
        X_eval = X_mat

    best_k, best_sil = None, -1.0
    for k in k_values:
        km = KMeans(n_clusters=k, n_init=10, random_state=random_state)
        labels = km.fit_predict(X_eval)
        # Silhouette needs at least 2 clusters with >1 sample
        if len(set(labels)) < 2:
            continue
        score = silhouette_score(X_eval, labels, metric="euclidean")
        if score > best_sil:
            best_sil, best_k = score, k
    return best_k if best_k is not None else DEFAULT_K, best_sil

def make_labels_by_cluster_median(df: pd.DataFrame, dose_col: str, clusters: np.ndarray, margin_pct: float):
    # median per cluster
    tmp = df[[dose_col]].copy()
    tmp["_cluster"] = clusters
    med = tmp.groupby("_cluster")[dose_col].median()

    # Broadcast median, compute direction with optional margin
    df = df.copy()
    df["_cluster"] = clusters
    df["_cluster_median_dose"] = df["_cluster"].map(med)
    # Label rule: 1 = increase if dose < median * (1 - margin)
    #             0 = decrease otherwise
    median_adj = df["_cluster_median_dose"] * (1.0 - margin_pct)
    y = (df[dose_col] < median_adj).astype("Int64")  # keep NA where dose is NA
    # Optionally drop borderline rows when margin>0? We already used a < (1 - margin) threshold
    return y, med.to_dict()

# -------------------------
# MAIN FLOW
# -------------------------
def main():
    # 1) Load
    df = pd.read_csv(DATA_PATH, low_memory=False)
    dose_col = find_dose_col(df)
    df[dose_col] = pd.to_numeric(df[dose_col], errors="coerce")

    # 2) Preprocess builder
    preprocess, feature_cols, id_cols = build_preprocessor(df, dose_col)

    # 3) Fit transform features for KMeans
    X = df[feature_cols].copy()
    X_mat = preprocess.fit_transform(X)

    # 4) Pick k (optional) & fit KMeans
    if TRY_PICK_K:
        k, sil = choose_k_by_silhouette(X_mat, K_CANDIDATES)
        print(f"[k-pick] best k={k} (silhouette on sample ≈ {sil:.3f})")
    else:
        k = DEFAULT_K

    kmeans = KMeans(n_clusters=k, n_init=20, random_state=RANDOM_STATE)
    clusters = kmeans.fit_predict(X_mat)

    # 5) Create labels (requires dose)
    y_full, cluster_medians = make_labels_by_cluster_median(df, dose_col, clusters, margin_pct=MARGIN_PCT)

    # Keep rows that have a valid label (dose not null; label produced)
    mask_train = y_full.notna()
    X_tr_full = df.loc[mask_train, feature_cols].copy()
    y_tr_full = y_full.loc[mask_train].astype(int)

    # Refit preprocess on training subset (avoid minor leakage concerns)
    preprocess_tr = preprocess.fit(X_tr_full)
    X_tr_mat = preprocess_tr.transform(X_tr_full)

    # 6) Supervised train/val split
    X_tr, X_te, y_tr, y_te = train_test_split(
        X_tr_mat, y_tr_full, test_size=0.2, random_state=RANDOM_STATE, stratify=y_tr_full
    )

    # 7) Train classifier (baseline)
    clf = LogisticRegression(
        max_iter=1000, class_weight="balanced", solver="lbfgs", random_state=RANDOM_STATE
    )
    clf.fit(X_tr, y_tr)
    y_pred = clf.predict(X_te)

    acc = accuracy_score(y_te, y_pred)
    p, r, f1, _ = precision_recall_fscore_support(y_te, y_pred, average="binary")
    print("\n=== Supervised performance on pseudo-labels (held-out) ===")
    print(f"Accuracy:  {acc:.3f}")
    print(f"Precision: {p:.3f}")
    print(f"Recall:    {r:.3f}")
    print(f"F1:        {f1:.3f}")
    print("\nConfusion matrix:\n", confusion_matrix(y_te, y_pred))
    print("\nClassification report:\n", classification_report(y_te, y_pred, digits=3))

    # 8) Save artifacts
    bundle = {
        "preprocess": preprocess_tr,
        "kmeans": kmeans,
        "cluster_medians": cluster_medians,
        "clf": clf,
        "feature_cols": feature_cols,
        "dose_col": dose_col,
        "id_cols": id_cols,
        "k": k,
        "margin_pct": MARGIN_PCT,
    }
    joblib.dump(bundle, MODEL_PATH)
    print(f"\nSaved model bundle to: {MODEL_PATH}")

    # 9) Demo predictions for first 200 valid rows (maps 0/1 -> decrease/increase)
    #    (We use all rows with non-null features; labels are not required for prediction.)
    X_all_mat = preprocess_tr.transform(df[feature_cols])
    yhat_all = clf.predict(X_all_mat)
    dir_map = {0: "decrease", 1: "increase"}

    # ID columns if present, else empty
    if all(c in df.columns for c in ID_COLS_CANON):
        out = df[ID_COLS_CANON].copy()
    else:
        out = pd.DataFrame(index=df.index)

    out["pred_direction"] = pd.Series(yhat_all, index=df.index).map(dir_map)
    out.head(200).to_csv(SAMPLE_PRED_PATH, index=False)
    print(f"Wrote sample predictions to: {SAMPLE_PRED_PATH}")

if __name__ == "__main__":
    main()


[k-pick] best k=3 (silhouette on sample ≈ 0.864)

=== Supervised performance on pseudo-labels (held-out) ===
Accuracy:  0.934
Precision: 0.949
Recall:    0.859
F1:        0.902

Confusion matrix:
 [[2080   53]
 [ 163  991]]

Classification report:
               precision    recall  f1-score   support

           0      0.927     0.975     0.951      2133
           1      0.949     0.859     0.902      1154

    accuracy                          0.934      3287
   macro avg      0.938     0.917     0.926      3287
weighted avg      0.935     0.934     0.933      3287


Saved model bundle to: antibiotic_direction_pipeline.joblib
Wrote sample predictions to: sample_predictions_first200.csv


In [13]:
from make_pseudolabels import make_pseudolabels

# Use your actual CSV path here:
RAW = r"output_with_is_dead.csv"

pseudo_df, meta = make_pseudolabels(
    input_path=RAW,
    # If you omit these, files are written next to RAW:
    output_csv=r"pseudolabel_dataset.csv",
    meta_json=r"pseudolabel_meta.json",
    # Optional: you can pass comma strings too, e.g. "SUBJECT_ID,HADM_ID,ICUSTAY_ID"
    id_cols=["SUBJECT_ID","HADM_ID","ICUSTAY_ID"],
    drop_cols=["is_dead"],
    k=4,
    margin_pct=0.00,
    random_state=1337,
)

pseudo_df.head()


[Info] Reading: output_with_is_dead.csv
[Info] Shape: (16431, 135)
[Info] Using dose column: DOSE_VAL_RX
[Info] #numeric feature columns: 121
[Info] Clustering with MiniBatchKMeans(k=4) ...
[OK] Wrote pseudolabel dataset: C:\Users\VishalMaurya\Downloads\BTP\model\3rd Attempt\pseudolabel_dataset.csv
[OK] Wrote metadata JSON:       C:\Users\VishalMaurya\Downloads\BTP\model\3rd Attempt\pseudolabel_meta.json
[Info] Rows with pseudolabels:  16419 / 16431


Unnamed: 0,SUBJECT_ID,HADM_ID,ICUSTAY_ID,DOSE_VAL_RX,_cluster,_cluster_median_dose,abx__*NF* Ertapenem Sodium,abx__AMOXicillin Oral Susp.,abx__AMP,abx__Amikacin,...,abx__Vancomycin Intrathecal,abx__Vancomycin Oral Liquid,abx__ce,abx__demeclocycline,abx__fidaxomicin,abx__fosfomycin tromethamine,abx__moxifloxacin,abx__ofloxacin,_pseudo_label,_pseudo_label_str
0,33,176176,296681,1000.0,1,1000.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,decrease
1,33,176176,296681,500.0,1,1000.0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,increase
2,33,176176,296681,1000.0,1,1000.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,decrease
3,52,190797,261857,2000.0,1,1000.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,decrease
4,52,190797,261857,500.0,2,500.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,decrease


In [14]:
# --- Supervised learning on pseudolabels -> write actual predicted labels to CSV ---
# Works in Windows/Jupyter. No CLI needed. Just set the paths and run.

from pathlib import Path
import json
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import joblib

# ====== EDIT THESE PATHS ======
PSEUDO_CSV  = r"pseudolabel_dataset.csv"          # produced by your make_pseudolabels step
OUTPUT_CSV  = r"final_labels_supervised.csv"      # where to save predictions
MODEL_PATH  = r"antibiotic_direction_pipeline.joblib"  # optional: save model bundle
META_JSON   = r"pseudolabel_meta.json"            # optional: if present, used to lock feature set
PREDICT_ON  = None  # set to r"C:\path\to\original_or_new_batch.csv" to predict elsewhere; or keep None

# ====== LOADING ======
pseudo_df = pd.read_csv(PSEUDO_CSV, low_memory=False)
if "_pseudo_label" not in pseudo_df.columns:
    raise ValueError("pseudolabel CSV must contain a '_pseudo_label' column.")

# Optional: load feature list from meta JSON (if you created it earlier)
feat_cols_from_meta = None
try:
    meta_p = Path(META_JSON)
    if meta_p.exists():
        meta = json.loads(meta_p.read_text(encoding="utf-8"))
        feat_cols_from_meta = meta.get("feature_cols", None)
        # optional ids to include in output (if present)
        id_cols_from_meta = meta.get("id_cols", [])
    else:
        id_cols_from_meta = ["SUBJECT_ID", "HADM_ID", "ICUSTAY_ID"]
except Exception:
    id_cols_from_meta = ["SUBJECT_ID", "HADM_ID", "ICUSTAY_ID"]

# ====== FEATURE SELECTION (numeric-only, robust fallback if no meta) ======
EXCLUDE_BASE = {
    "_pseudo_label", "_pseudo_label_str",
    "_cluster", "_cluster_median_dose",
    "dose_val_rx", "dose", "dosage", "dose_val", "abx_dose"
}
EXCLUDE_BASE |= set([c for c in id_cols_from_meta if c in pseudo_df.columns])

if feat_cols_from_meta:
    # use meta-driven features (safer when predicting on new data)
    missing = [c for c in feat_cols_from_meta if c not in pseudo_df.columns]
    if missing:
        raise ValueError(f"Pseudo CSV missing expected feature columns from meta: {missing[:15]} ...")
    feature_cols = feat_cols_from_meta
else:
    # infer: all numeric columns except excluded
    numeric_cols = list(pseudo_df.select_dtypes(include=[np.number]).columns)
    feature_cols = [c for c in numeric_cols if c not in EXCLUDE_BASE]

if not feature_cols:
    raise ValueError("No feature columns found to train on. Check your pseudolabel CSV.")

# ====== PREPROCESS + TRAIN ======
X_all = pseudo_df[feature_cols].copy()
y_all = pseudo_df["_pseudo_label"].astype(int).values

preprocess = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
X_all_proc = preprocess.fit_transform(X_all)

X_tr, X_te, y_tr, y_te = train_test_split(
    X_all_proc, y_all, test_size=0.20, random_state=42, stratify=y_all
)

clf = LogisticRegression(
    max_iter=1000, class_weight="balanced", solver="lbfgs", random_state=42
)
clf.fit(X_tr, y_tr)

# quick validation (vs pseudolabels)
y_pred = clf.predict(X_te)
print("=== Validation vs pseudolabels (held-out) ===")
print("Accuracy:", round(accuracy_score(y_te, y_pred), 3))
print("F1:", round(f1_score(y_te, y_pred), 3))
print("Confusion:\n", confusion_matrix(y_te, y_pred))
print(classification_report(y_te, y_pred, digits=3))

# ====== PREDICT TARGET TABLE ======
if PREDICT_ON:
    target_df = pd.read_csv(PREDICT_ON, low_memory=False)
    # ensure feature set exists
    miss2 = [c for c in feature_cols if c not in target_df.columns]
    if miss2:
        raise ValueError(f"'predict_on' CSV missing feature columns: {miss2[:15]} ...")
    X_pred_df = target_df[feature_cols].copy()
    id_cols_present = [c for c in id_cols_from_meta if c in target_df.columns]
else:
    target_df = pseudo_df
    X_pred_df = target_df[feature_cols].copy()
    id_cols_present = [c for c in id_cols_from_meta if c in target_df.columns]

X_pred = preprocess.transform(X_pred_df)
yhat = clf.predict(X_pred)
proba_inc = clf.predict_proba(X_pred)[:, 1]

# ====== WRITE OUTPUT ======
out = target_df[id_cols_present].copy() if id_cols_present else pd.DataFrame(index=target_df.index)
out["final_label"] = np.where(yhat == 1, "increase", "decrease")
out["final_label_id"] = yhat
out["final_label_prob_increase"] = proba_inc

out_path = Path(OUTPUT_CSV)
out_path.parent.mkdir(parents=True, exist_ok=True)
out.to_csv(out_path, index=False)
print(f"[OK] Wrote predictions to: {out_path.resolve()}")

# ====== SAVE MODEL BUNDLE (optional) ======
joblib.dump({
    "preprocess": preprocess,
    "clf": clf,
    "feature_cols": feature_cols,
    "id_cols": id_cols_from_meta
}, MODEL_PATH)
print(f"[OK] Saved model bundle: {Path(MODEL_PATH).resolve()}")


=== Validation vs pseudolabels (held-out) ===
Accuracy: 0.97
F1: 0.956
Confusion:
 [[2104   40]
 [  59 1081]]
              precision    recall  f1-score   support

           0      0.973     0.981     0.977      2144
           1      0.964     0.948     0.956      1140

    accuracy                          0.970      3284
   macro avg      0.969     0.965     0.967      3284
weighted avg      0.970     0.970     0.970      3284

[OK] Wrote predictions to: C:\Users\VishalMaurya\Downloads\BTP\model\3rd Attempt\final_labels_supervised.csv
[OK] Saved model bundle: C:\Users\VishalMaurya\Downloads\BTP\model\3rd Attempt\antibiotic_direction_pipeline.joblib
