In [14]:
#!/usr/bin/env python3
# ------------------------------------------------------------
# Logistic Regression on the *_full.csv splits
# ------------------------------------------------------------
import json, joblib, numpy as np, pandas as pd
from pathlib import Path
from scipy import sparse
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score

# ------------------------------------------------------------------#
# 0. paths                                                           #
# ------------------------------------------------------------------#
AUG_DIR = Path("/data1/home/srinivasana/peds_agents/agents/notebooks/outputs/splits_with_feats")

df_train = pd.read_csv(AUG_DIR / "train_full.csv", dtype=str, na_filter=False)
df_test  = pd.read_csv(AUG_DIR / "test_full.csv",  dtype=str, na_filter=False)

# ------------------------------------------------------------------#
# 1. label → int mapping                                             #
# ------------------------------------------------------------------#
label2id = {"NotExtrapolated": 0, "Partial": 1, "Full": 2, "Unlabeled": 3}

for d in (df_train, df_test):
    d["label"] = (d["label"]
                  .str.strip()
                  .map(label2id))           # now int 0-3

# ------------------------------------------------------------------#
# 2. column groups                                                   #
# ------------------------------------------------------------------#
num_cols = [
    "total_studies","age_min","age_max",
    "Number of Centers","Number of Countries",
    "Patients Enrolled","Patients Analyzed",
    "Total # of Hispanic/Latino","Total # of Non-Hispanic/Non-Latino",
    "Total #  of Unknown Ethnicity","Total #  of Asian",
    "Total #  of Black","Total #  of White",
]

bool_cols = [
    'Efficacy','Safety','Pharmacokinetic','Pharmacodynamic',
    'Tolerability','Other_Type',
    'Randomized_DoubleBlind','Randomized_SingleBlind','Open_Label',
    'Placebo_Control','Active_Comparator','Dose_Escalation',
    'Population_PK','Other_Design',
    'Studied in Neonates','Indicated in Neonates',
]

cat_cols = [
    'Type of Legislation','Therapeutic Category',
    'Dosage Form(s)','Route(s) of Administration',
]

# ------------------------------------------------------------------#
# 3. one-hot categoricals                                            #
# ------------------------------------------------------------------#
train_cat = pd.get_dummies(df_train[cat_cols], prefix=cat_cols, dtype=np.int8)
test_cat  = pd.get_dummies(df_test [cat_cols], prefix=cat_cols, dtype=np.int8)

# lock template from train, re-index others
cat_columns = train_cat.columns
test_cat    = test_cat.reindex(columns=cat_columns, fill_value=0)

# ------------------------------------------------------------------#
# 4. boolean matrices (add missing columns as 0)                     #
# ------------------------------------------------------------------#
def bool_matrix(df):
    missing = [c for c in bool_cols if c not in df]
    if missing:
        df[missing] = "0"
    return df[bool_cols].astype(int).values

train_bool = bool_matrix(df_train)
test_bool  = bool_matrix(df_test)

# ------------------------------------------------------------------#
# 5. numeric block – blanks→NaN→0 → scale                           #
# ------------------------------------------------------------------#
scaler = StandardScaler()

X_train_num = scaler.fit_transform(
                 df_train[num_cols].replace("", np.nan)
                                     .astype(float)
                                     .fillna(0))
X_test_num  = scaler.transform(
                 df_test[num_cols].replace("", np.nan)
                                   .astype(float)
                                   .fillna(0))

to_csr = lambda a: sparse.csr_matrix(a)

# ------------------------------------------------------------------#
# 6. final sparse matrices                                          #
# ------------------------------------------------------------------#
X_train = sparse.hstack([
    to_csr(X_train_num),
    to_csr(train_bool),
    to_csr(train_cat.values)
], format="csr")

X_test = sparse.hstack([
    to_csr(X_test_num),
    to_csr(test_bool),
    to_csr(test_cat.values)
], format="csr")

y_train = df_train["label"].astype(int).values
y_test  = df_test ["label"].astype(int).values

print("feature dims → train:", X_train.shape, " test:", X_test.shape)

# ------------------------------------------------------------------#
# 7. train Logistic Regression                                       #
# ------------------------------------------------------------------#
clf = LogisticRegression(
        max_iter=3000,
        solver="lbfgs",
        multi_class="multinomial",
        class_weight="balanced",
)
clf.fit(X_train, y_train)

# ------------------------------------------------------------------#
# 8. evaluation                                                     #
# ------------------------------------------------------------------#
print("\n-- TRAIN --")
print(classification_report(y_train, clf.predict(X_train), digits=3))

print("\n-- TEST --")
print(classification_report(y_test,  clf.predict(X_test),  digits=3))

macro_f1 = f1_score(y_test, clf.predict(X_test), average="macro")
acc      = accuracy_score(y_test, clf.predict(X_test))
print(f"\nTEST  macro-F1 = {macro_f1:.3f}   accuracy = {acc:.3f}")

# ------------------------------------------------------------------#
# 9. save artefacts                                                 #
# ------------------------------------------------------------------#
joblib.dump({
    "model": clf,
    "scaler": scaler,
    "cat_columns": cat_columns.tolist(),
    "bool_columns": bool_cols,
    "num_columns": num_cols,
}, AUG_DIR / "logreg_artifact.joblib")

with open(AUG_DIR / "logreg_test_metrics.json", "w") as fh:
    json.dump({"macro_f1": float(macro_f1),
               "accuracy": float(acc)}, fh, indent=2)

print("\n✓ Logistic Regression pipeline complete.")


feature dims → train: (687, 253)  test: (34, 253)

-- TRAIN --
              precision    recall  f1-score   support

           0      0.926     0.657     0.769       420
           1      0.651     0.798     0.717       208
           2      0.400     1.000     0.571        12
           3      0.385     0.851     0.530        47

    accuracy                          0.719       687
   macro avg      0.590     0.827     0.647       687
weighted avg      0.797     0.719     0.733       687


-- TEST --
              precision    recall  f1-score   support

           0      0.833     0.714     0.769        21
           1      0.636     0.700     0.667        10
           2      0.000     0.000     0.000         1
           3      0.000     0.000     0.000         2

    accuracy                          0.647        34
   macro avg      0.367     0.354     0.359        34
weighted avg      0.702     0.647     0.671        34


TEST  macro-F1 = 0.359   accuracy = 0.647

✓ Logistic 



In [None]:
#!/usr/bin/env python3
# logreg_tuned.py  –  logistic regression with dev-set tuning
# ------------------------------------------------------------
import json, joblib, numpy as np, pandas as pd
from pathlib import Path
from scipy import sparse
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, accuracy_score, make_scorer
from sklearn.pipeline import Pipeline

# ------------------------------------------------------------------#
# 0.  paths & data                                                   #
# ------------------------------------------------------------------#
AUG_DIR = Path("/data1/home/srinivasana/peds_agents/agents/notebooks/outputs/splits_with_feats")
df_train = pd.read_csv(AUG_DIR / "train_full.csv", dtype=str, na_filter=False)
df_dev   = pd.read_csv(AUG_DIR / "dev_full.csv",   dtype=str, na_filter=False)   # <── NEW
df_test  = pd.read_csv(AUG_DIR / "test_full.csv",  dtype=str, na_filter=False)

# ------------------------------------------------------------------#
# 1.  label → int                                                   #
# ------------------------------------------------------------------#
label2id = {"NotExtrapolated": 0, "Partial": 1, "Full": 2, "Unlabeled": 3}
for d in (df_train, df_dev, df_test):
    d["label"] = d["label"].str.strip().map(label2id)

# ------------------------------------------------------------------#
# 2.  column groups (unchanged)                                     #
# ------------------------------------------------------------------#
num_cols = [
    "total_studies","age_min","age_max",
    "Number of Centers","Number of Countries",
    "Patients Enrolled","Patients Analyzed",
    "Total # of Hispanic/Latino","Total # of Non-Hispanic/Non-Latino",
    "Total #  of Unknown Ethnicity","Total #  of Asian",
    "Total #  of Black","Total #  of White",
]

bool_cols = [
    'Efficacy','Safety','Pharmacokinetic','Pharmacodynamic',
    'Tolerability','Other_Type',
    'Randomized_DoubleBlind','Randomized_SingleBlind','Open_Label',
    'Placebo_Control','Active_Comparator','Dose_Escalation',
    'Population_PK','Other_Design',
    'Studied in Neonates','Indicated in Neonates',
]

cat_cols = [
    'Type of Legislation','Therapeutic Category',
    'Dosage Form(s)','Route(s) of Administration',
]

# ------------------------------------------------------------------#
# 3.  helper fns (unchanged)                                        #
# ------------------------------------------------------------------#
def one_hot(train, *others):
    tr = pd.get_dummies(train[cat_cols], prefix=cat_cols, dtype=np.int8)
    outs = [pd.get_dummies(o[cat_cols], prefix=cat_cols, dtype=np.int8)
            .reindex(columns=tr.columns, fill_value=0) for o in others]
    return (tr, *outs)

def bool_mat(df):
    miss = [c for c in bool_cols if c not in df]
    if miss: df[miss] = "0"
    return df[bool_cols].astype(int).values

def num_mat(df, scaler=None, fit=False):
    m = df[num_cols].replace("", np.nan).astype(float).fillna(0)
    if fit:
        scaler.fit(m)
    return scaler.transform(m)

to_csr = lambda a: sparse.csr_matrix(a)

# Fit scaler on **train only** ------------------------------------
scaler = StandardScaler()
Xtr_num = num_mat(df_train, scaler, fit=True)
Xdv_num = num_mat(df_dev,   scaler)
Xte_num = num_mat(df_test,  scaler)

# Booleans & categoricals -----------------------------------------
Xtr_bool, Xdv_bool, Xte_bool = map(bool_mat, (df_train, df_dev, df_test))
Xtr_cat , Xdv_cat , Xte_cat  = one_hot(df_train, df_dev, df_test)

# Stack ------------------------------------------------------------
def stack(num, boo, cat):
    return sparse.hstack([to_csr(num), to_csr(boo), to_csr(cat)], format="csr")

X_train = stack(Xtr_num, Xtr_bool, Xtr_cat.values)
X_dev   = stack(Xdv_num, Xdv_bool, Xdv_cat.values)
X_test  = stack(Xte_num, Xte_bool, Xte_cat.values)

y_train = df_train["label"].values
y_dev   = df_dev  ["label"].values
y_test  = df_test ["label"].values

print("dims  train:", X_train.shape, "dev:", X_dev.shape, "test:", X_test.shape)

# ------------------------------------------------------------------#
# 4.  hyper-parameter search on the dev set                         #
# ------------------------------------------------------------------#
param_grid = {
    "C":        [0.01, 0.1, 1, 3, 10],
    "penalty":  ["l2", "l1", "elasticnet"],
    "l1_ratio": [None, 0.5],       # None ignored unless penalty="elasticnet"
}

# Custom scorer — macro-F1
f1_macro = make_scorer(f1_score, average="macro")

logreg = LogisticRegression(max_iter=5000,
                            solver="saga",            # supports l1/elastic
                            multi_class="multinomial",
                            class_weight="balanced")

search = GridSearchCV(logreg,
                      param_grid,
                      scoring=f1_macro,
                      cv=[(np.arange(len(y_train)),               # train idx (all)
                           np.arange(len(y_train), len(y_train))) # dummy (unused)
                         ],                                       # 1-fold CV: we’ll
                      refit=True,                                 # just score on dev
                      verbose=0)

# We cheat slightly: fit on train, score on dev manually -----------
search.fit(X_train, y_train)
best_clf = search.best_estimator_

print(f"\nBest params: {search.best_params_}  (dev macro-F1 = {search.best_score_:.3f})")

# ------------------------------------------------------------------#
# 5.  Evaluate on TEST                                             #
# ------------------------------------------------------------------#
print("\n-- TEST --")
print(classification_report(y_test, best_clf.predict(X_test), digits=3))

macro_f1 = f1_score(y_test,  best_clf.predict(X_test), average="macro")
acc      = accuracy_score(y_test, best_clf.predict(X_test))
print(f"\nTEST macro-F1 = {macro_f1:.3f}   accuracy = {acc:.3f}")

# ------------------------------------------------------------------#
# 6.  save artefacts                                               #
# ------------------------------------------------------------------#
joblib.dump({
    "model": best_clf,
    "scaler": scaler,
    "cat_columns": Xtr_cat.columns.tolist(),
    "bool_columns": bool_cols,
    "num_columns":  num_cols,
}, AUG_DIR / "logreg_tuned.joblib")

with open(AUG_DIR / "logreg_test_metrics.json", "w") as fh:
    json.dump({"macro_f1": float(macro_f1),
               "accuracy":  float(acc)}, fh, indent=2)

print("\n✓ Logistic Regression (tuned) complete.")


dims  train: (687, 253) dev: (16, 253) test: (34, 253)


Traceback (most recent call last):
  File "/data1/home/srinivasana/peds_agents/peds-agents-venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/data1/home/srinivasana/peds_agents/peds-agents-venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/data1/home/srinivasana/peds_agents/peds-agents-venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
  File "/data1/home/srinivasana/peds_agents/peds-agents-venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
  File "/data1/home/srinivasana/peds_agents/peds-agents-venv/lib/python3.10/site-packages/sklearn/utils/_response.py", line 214, in _get_response_values
    y_pred = prediction


Best params: {'C': 0.01, 'l1_ratio': None, 'penalty': 'l2'}  (dev macro-F1 = nan)

-- TEST --
              precision    recall  f1-score   support

           0      0.727     0.381     0.500        21
           1      0.545     0.600     0.571        10
           2      0.000     0.000     0.000         1
           3      0.000     0.000     0.000         2

    accuracy                          0.412        34
   macro avg      0.318     0.245     0.268        34
weighted avg      0.610     0.412     0.477        34


TEST macro-F1 = 0.268   accuracy = 0.412

✓ Logistic Regression (tuned) complete.


Traceback (most recent call last):
  File "/data1/home/srinivasana/peds_agents/peds-agents-venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 949, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/data1/home/srinivasana/peds_agents/peds-agents-venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 288, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/data1/home/srinivasana/peds_agents/peds-agents-venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 380, in _score
    y_pred = method_caller(
  File "/data1/home/srinivasana/peds_agents/peds-agents-venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
  File "/data1/home/srinivasana/peds_agents/peds-agents-venv/lib/python3.10/site-packages/sklearn/utils/_response.py", line 214, in _get_response_values
    y_pred = prediction

In [17]:
#!/usr/bin/env python3
# logistic_regression.py  – tuned LR on *_full.csv splits
# -------------------------------------------------------

import json, joblib, numpy as np, pandas as pd
from pathlib import Path
from itertools import product
from scipy import sparse
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, classification_report

# ───────────────────────── 0. paths & data ─────────────────────────
AUG_DIR   = Path("/data1/home/srinivasana/peds_agents/agents/notebooks/outputs/splits_with_feats")
df_train  = pd.read_csv(AUG_DIR / "train_full.csv", dtype=str, na_filter=False)
df_dev    = pd.read_csv(AUG_DIR / "dev_full.csv",   dtype=str, na_filter=False)
df_test   = pd.read_csv(AUG_DIR / "test_full.csv",  dtype=str, na_filter=False)

label2id = {"NotExtrapolated": 0, "Partial": 1, "Full": 2, "Unlabeled": 3}
for d in (df_train, df_dev, df_test):
    d["label"] = d["label"].str.strip().map(label2id)

# ───────────────────────── 1. column groups ────────────────────────
num_cols  = [
    "total_studies","age_min","age_max",
    "Number of Centers","Number of Countries",
    "Patients Enrolled","Patients Analyzed",
    "Total # of Hispanic/Latino","Total # of Non-Hispanic/Non-Latino",
    "Total #  of Unknown Ethnicity","Total #  of Asian",
    "Total #  of Black","Total #  of White",
]
bool_cols = [
    'Efficacy','Safety','Pharmacokinetic','Pharmacodynamic',
    'Tolerability','Other_Type',
    'Randomized_DoubleBlind','Randomized_SingleBlind','Open_Label',
    'Placebo_Control','Active_Comparator','Dose_Escalation',
    'Population_PK','Other_Design',
    'Studied in Neonates','Indicated in Neonates',
]
cat_cols  = [
    'Type of Legislation','Therapeutic Category',
    'Dosage Form(s)','Route(s) of Administration',
]

# ───────────────────────── 2. helpers ──────────────────────────────
to_csr = lambda a: sparse.csr_matrix(a)

def one_hot(train, *others):
    tr  = pd.get_dummies(train[cat_cols], prefix=cat_cols, dtype=np.int8)
    res = [pd.get_dummies(o[cat_cols], prefix=cat_cols, dtype=np.int8)
             .reindex(columns=tr.columns, fill_value=0) for o in others]
    return (tr, *res)

def bool_mat(df):
    miss = [c for c in bool_cols if c not in df]
    if miss: df[miss] = "0"
    return df[bool_cols].astype(int).values

def num_mat(df, scaler=None, fit=False):
    mat = df[num_cols].replace("", np.nan).astype(float).fillna(0)
    if fit: scaler.fit(mat)
    return scaler.transform(mat)

# ───────────────────────── 3. build matrices ───────────────────────
scaler     = StandardScaler()
Xtr_num    = num_mat(df_train, scaler, fit=True)
Xdv_num    = num_mat(df_dev,   scaler)
Xte_num    = num_mat(df_test,  scaler)

Xtr_bool, Xdv_bool, Xte_bool  = map(bool_mat, (df_train, df_dev, df_test))
Xtr_cat , Xdv_cat , Xte_cat   = one_hot(df_train, df_dev, df_test)

def stack(num, boo, cat):
    return sparse.hstack([to_csr(num), to_csr(boo), to_csr(cat)], format="csr")

X_train = stack(Xtr_num, Xtr_bool, Xtr_cat.values)
X_dev   = stack(Xdv_num, Xdv_bool, Xdv_cat.values)
X_test  = stack(Xte_num, Xte_bool, Xte_cat.values)

y_train = df_train["label"].astype(int).values
y_dev   = df_dev  ["label"].astype(int).values
y_test  = df_test ["label"].astype(int).values

print("dims  train:", X_train.shape, "dev:", X_dev.shape, "test:", X_test.shape)

# ───────────────────────── 4. hyper-param search (train → dev) ─────
grid = {
    "C":        [0.01, 0.1, 1, 3, 10],
    "penalty":  ["l2", "l1"],
    "l1_ratio": [None, 0.5],     # ignored unless penalty='elasticnet'
}

best_f1, best_params, best_clf = -1, None, None
for C, pen, l1r in product(grid["C"], grid["penalty"], grid["l1_ratio"]):
    if pen != "elasticnet" and l1r is not None:
        continue
    clf = LogisticRegression(
            max_iter     = 5000,
            solver       = "saga",
            multi_class  = "multinomial",
            class_weight = "balanced",
            C=C, penalty=pen, l1_ratio=l1r)
    clf.fit(X_train, y_train)
    f1 = f1_score(y_dev, clf.predict(X_dev), average="macro", zero_division=0)
    if f1 > best_f1:
        best_f1, best_params, best_clf = f1, (C, pen, l1r), clf

print(f"✓ best on DEV  macro-F1={best_f1:.3f}  "
      f"C={best_params[0]}  penalty={best_params[1]}  l1_ratio={best_params[2]}")

# ───────────────────────── 5. refit on TRAIN+DEV  ───────────────────
X_full = sparse.vstack([X_train, X_dev])
y_full = np.concatenate([y_train, y_dev])

final_clf = LogisticRegression(
        max_iter     = 5000,
        solver       = "saga",
        multi_class  = "multinomial",
        class_weight = "balanced",
        C            = best_params[0],
        penalty      = best_params[1],
        l1_ratio     = best_params[2],
)
final_clf.fit(X_full, y_full)

# ───────────────────────── 6. test evaluation ───────────────────────
print("\n-- TEST --")
print(classification_report(y_test, final_clf.predict(X_test),
                            digits=3, zero_division=0))

macro_f1 = f1_score(y_test, final_clf.predict(X_test),
                    average="macro", zero_division=0)
acc      = accuracy_score(y_test, final_clf.predict(X_test))
print(f"\nTEST macro-F1 = {macro_f1:.3f}   accuracy = {acc:.3f}")

# ───────────────────────── 7. save artefacts ────────────────────────
joblib.dump({
    "model":       final_clf,
    "scaler":      scaler,
    "cat_columns": Xtr_cat.columns.tolist(),
    "bool_columns": bool_cols,
    "num_columns":  num_cols,
}, AUG_DIR / "logreg_tuned.joblib")

with open(AUG_DIR / "logreg_test_metrics.json", "w") as fh:
    json.dump({"macro_f1": float(macro_f1),
               "accuracy":  float(acc)}, fh, indent=2)

print("\n✓ Logistic Regression pipeline complete.")


dims  train: (687, 253) dev: (16, 253) test: (34, 253)




ValueError: l1_ratio must be specified when penalty is elasticnet.