# 14 LLM Extensions

## 0 Imports and Setup

In [1]:
from openai import OpenAI
import os
import pandas as pd
import numpy as np
from pathlib import Path

from src.utils import resolve_path

from src.data_prep import split_data

from src.features import (
    scale_features,
    scale_llm_features,
    merge_embeddings_with_features,
    save_feature_dataset
)

from src.resampling import resample_training_data, print_class_balance, save_resampled_dataset

from src.models import (
    get_classifiers,
    get_param_distributions,
    get_n_iter_random_per_clf,
    repeated_cv_with_mixed_search,
    auc_scorer
)

from src.evaluation import export_summary

from src.llm_long_context_summary import LongContextSummaryConfig

from src.llm_embeddings import (
    EmbeddingConfig,
    _call_embedding_provider,
    prepare_embedding_inputs,
    run_llm_embeddings_pipeline,
    load_embeddings_artifact,
    load_embedding_cache,
    validate_cache_schema,
    run_long_context_embeddings_pipeline
)

from src.llm_structured_features import (
    smoke_test_structured_extraction,
    ExtractionConfig,
    extract_features_with_retries,
    SCHEMA_VERSION,
    SEPSIS_MORTALITY_MODEL_SCHEMA,
    DEFAULT_FEATURES_ARTIFACT_PATH,
    TruncationPolicy,
    prepare_feature_inputs,
    run_llm_structured_features_pipeline
)

import src.llm_embeddings as llmA

import src.llm_structured_features as llmB

In [None]:
# OPTIONAL (cost tracking hygiene):
# 1) Create a dedicated OpenAI API key for this embedding run (Dashboard → API keys).
# 2) Set it as OPENAI_API_KEY (env var) before launching Jupyter.
# 3) Run the embedding build once.
# 4) Revoke/delete the key after the run to prevent accidental future spend.

In [2]:
import os
# Stop here when OPENAI_API_KEY is unset or empty, before any API-calling cell runs.
assert os.environ.get("OPENAI_API_KEY"), "OPENAI_API_KEY not found in environment"

### Smoke Tests

In [None]:
# Smoke test: request one 256-dimension embedding directly through the OpenAI
# client, bypassing the project wrappers.
client = OpenAI()

resp = client.embeddings.create(
    model="text-embedding-3-small",
    input=["test sepsis note"],
    dimensions=256
)

# Cell output: length of the returned vector (256 dimensions were requested above).
len(resp.data[0].embedding)


In [None]:
# Smoke test for the project's embedding wrapper: embed two short strings and
# verify the result instead of only printing it, so a broken wrapper fails loudly.
SMOKE_TEXTS = ["hello world", "sepsis mortality risk"]
SMOKE_DIMENSIONS = 256

cfg_smoke = EmbeddingConfig(model="text-embedding-3-small", dimensions=SMOKE_DIMENSIONS, batch_size=8)
arr = _call_embedding_provider(SMOKE_TEXTS, cfg_smoke)

print(arr.shape)          # (2, 256)
print(np.isfinite(arr).all())

# Expect one row per input text, one column per requested dimension, and no NaN/inf.
assert arr.shape == (len(SMOKE_TEXTS), SMOKE_DIMENSIONS), f"unexpected embedding shape: {arr.shape}"
assert np.isfinite(arr).all(), "embedding contains non-finite values"


In [None]:
# Test Structured feature extraction
# Calls the project's smoke test for the structured-extraction path with its default arguments.

smoke_test_structured_extraction()

## 1 Load Data

In [3]:
# Read the NLP-ready table from the interim data folder.
nlp_ready_path = resolve_path("data/interim/data_nlp_ready.csv")
nlp_ready_df = pd.read_csv(nlp_ready_path)

# Report the frame's dimensions and preview its first ten column names.
first_ten_columns = list(nlp_ready_df.columns[:10])
print(f"‚úÖ Loaded NLP-ready dataset: {nlp_ready_df.shape}")
print(f"Columns: {first_ten_columns} ...")

‚úÖ Loaded NLP-ready dataset: (5208, 51)
Columns: ['subject_id', 'hospital_expire_flag', 'max_age', 'los_icu', 'first_hosp_stay', 'suspected_infection', 'sofa_score', 'sepsis3', 'avg_urineoutput', 'glucose_min'] ...


In [4]:
# Count rows whose aggregated radiology note is null; nothing is dropped here.
n_missing = nlp_ready_df["Radiology_notes"].isnull().sum()
print(f"‚ÑπÔ∏è Radiology_notes missing for {n_missing} patients (kept intentionally)")

‚ÑπÔ∏è Radiology_notes missing for 2 patients (kept intentionally)


In [5]:
# Set aggregated Radiology notes (could replace this with discharge later)
# TEXT_COL is the note-text column name handed to the LLM pipelines in later cells.
TEXT_COL = "Radiology_notes"

In [6]:
# Drop note text columns
# so that the classical feature table contains no free text.
original_df = nlp_ready_df.drop(
    columns=["Radiology_notes", "Discharge_summary_notes", "combined_notes"]
)

# Remove the target and three non-feature columns. subject_id is left in
# X_original: it is read further down to recover train/test membership.
X_original = original_df.drop(columns=[
    'hospital_expire_flag',
    'first_hosp_stay',
    'suspected_infection',
    'sepsis3'])
y_original = original_df["hospital_expire_flag"]


print(f"‚úÖ Structured features: {X_original.shape}, Target: {y_original.shape}")

# Hold out 20% of the rows for testing, with a fixed random_state.
X_train_orig, X_test_orig, y_train_orig, y_test_orig = split_data(X_original, y_original, test_size=0.2, random_state=42)
print(f"Train: {X_train_orig.shape}, Test: {X_test_orig.shape}")

# NOTE(review): subject_id is still a column of the frames passed in here —
# confirm that scale_features leaves it unscaled.
X_train_orig_scaled, X_test_orig_scaled, y_train_orig, y_test_orig = scale_features(
    X_train_orig, X_test_orig, y_train_orig, y_test_orig, prefix="original"
)

# Subject IDs of each partition; later cells use these sets to slice the LLM feature table.
train_ids = set(X_train_orig["subject_id"])
test_ids  = set(X_test_orig["subject_id"])

# Align note text subsets to train/test subjects
train_notes = nlp_ready_df.loc[nlp_ready_df["subject_id"].isin(train_ids)].copy()
test_notes  = nlp_ready_df.loc[nlp_ready_df["subject_id"].isin(test_ids)].copy()

‚úÖ Structured features: (5208, 44), Target: (5208,)
Train: (4166, 44), Test: (1042, 44)
‚úÖ Scaled original features prepared (not saved ‚Äî handled downstream)


## 2 Generate and Cache Dense Embeddings for All Patients - LLM Extension A

In [None]:
# --- LLM Extension A: long-context summaries followed by dense embeddings ---

# Settings for the summarisation step.
summary_cfg = LongContextSummaryConfig(
    temperature=0.0,
    max_output_tokens=3500,
    model="gpt-4.1-mini",
)

# Settings for the embedding step.
emb_cfg = EmbeddingConfig(
    use_batch_api=True,
    batch_size=64,
    dimensions=256,
    model="text-embedding-3-small",
)

# All pipeline arguments gathered in one place; TEXT_COL is the aggregated
# Radiology_notes text column.
pipeline_kwargs = dict(
    nlp_ready_df=nlp_ready_df,
    raw_text_col=TEXT_COL,
    id_col="subject_id",
    summary_cfg=summary_cfg,
    embedding_cfg=emb_cfg,
    summary_artifact_path="data/processed/llm_A/long_context_summaries.parquet",
    embeddings_artifact_path="data/processed/llm_A/embeddings_all.parquet",
    source_path="data/interim/data_nlp_ready.csv",
    verbose=True,
)

df_emb = run_long_context_embeddings_pipeline(**pipeline_kwargs)


‚öôÔ∏è long-context summary artifact missing/invalid ‚Äî generating summaries.
üîé Long-context summary cache loaded: 0 rows (model=gpt-4.1-mini)
üß© Summaries needed: 5208 / 5208
‚û°Ô∏è  Summarizing: 0 / 5208


In [None]:
from openai import OpenAI

# Retrieve one Batch API job by its hard-coded ID and print the fields used to
# diagnose it.
client = OpenAI()
batch_id = "batch_69580b0645bc8190a96da9ec1428fde1"
b = client.batches.retrieve(batch_id)

for field in ("status", "errors", "error_file_id", "input_file_id"):
    print(f"{field}:", getattr(b, field))


## 3 Validate Dense Embeddings and Caches

In [None]:
# Validate the embedding cache and the saved artifact from section 2.
# `emb_cfg` is the embedding configuration defined in the section-2 cell, and
# `df_emb` is the frame returned by the pipeline call there; both exist on a
# fresh top-to-bottom run.
df_cache = load_embedding_cache(emb_cfg)
validate_cache_schema(df_cache, emb_cfg)

df_art = load_embeddings_artifact("data/processed/llm_A/embeddings_all.parquet")

print("cache rows:", df_cache.shape[0])
print("artifact rows:", df_art.shape[0])

# The artifact on disk must have as many rows as the frame the pipeline returned.
# NOTE(review): a stricter check against the number of patients in nlp_ready_df
# could be added once it is confirmed the pipeline keeps every input row.
assert df_art.shape[0] == df_emb.shape[0], (
    f"artifact has {df_art.shape[0]} rows but the pipeline returned {df_emb.shape[0]}"
)
assert df_art.shape[0] <= df_cache.shape[0]  # cache may contain more if you reran with different configs


In [None]:
# Confirm no missing Embeddings
emb_cols = [c for c in df_art.columns if c.startswith("emb_")]

# Guard against a vacuous pass: with no emb_* columns, the NaN check below would
# succeed on an empty selection.
assert emb_cols, "no 'emb_' columns found in the embeddings artifact"
assert not df_art[emb_cols].isna().any().any(), "embeddings artifact contains missing values"


## 4 LLM Structured Feature Extraction

### 4.1 Mortality-Focused Schema

**Schema version (model-scoped):**  
`sepsis_mortality_gpt_4o_mini_v1`

This schema is designed to extract mortality-relevant clinical signals from radiology notes for patients with sepsis.  

---

### Infection Context

**suspected_infection_source** (categorical)  
Allowed values:
- respiratory  
- urinary  
- intra_abdominal  
- skin_soft_tissue  
- line_catheter  
- cns  
- other  
- unknown  

**suspected_pathogen_type** (categorical)  
Allowed values:
- bacterial  
- viral  
- fungal  
- mixed  
- unknown  

---

### SOFA-Aligned Organ Dysfunction Signals  
(Indicators associated with severity and mortality)

#### Respiratory (SOFA respiratory)
- resp_failure_present (0 / 1)  
- mechanical_ventilation_present (0 / 1)  
- hypoxemia_present (0 / 1)  

#### Cardiovascular (SOFA CV)
- hypotension_present (0 / 1)  
- vasopressor_use_present (0 / 1)  

#### Renal (SOFA renal)
- aki_present (0 / 1)  
- oliguria_anuria_present (0 / 1)  
- dialysis_present (0 / 1)  

#### Coagulation (SOFA coag)
- thrombocytopenia_present (0 / 1)  

#### Liver (SOFA liver)
- hyperbilirubinemia_present (0 / 1)  

#### Central Nervous System (SOFA CNS)
- altered_mentation_present (0 / 1)  

---

### Mortality Modifiers

**high_risk_course_language_present** (0 / 1)  
Language suggesting clinical deterioration or poor prognosis, e.g.:
- “worsening”
- “critical”
- “guarded”
- “poor prognosis”
- “multiorgan failure”
- “rapid decline”

**limitation_of_care_present** (0 / 1)  
Language indicating limitations or withdrawal of care, e.g.:
- DNR / DNI
- comfort measures only
- hospice
- withdrawal of life-sustaining treatment

---

### Imaging-Level Severity Impression

**imaging_severity_impression** (categorical)  
Overall severity language in the radiology impression:
- mild  
- moderate  
- severe  
- unknown 


In [None]:
# ==================================================
# Feature Definitions
# ==================================================

# ------------------------------
# Infection context
# ------------------------------

# suspected_infection_source: categorical
# Allowed values:
#   - respiratory
#   - urinary
#   - intra_abdominal
#   - skin_soft_tissue
#   - line_catheter
#   - cns
#   - other
#   - unknown

# suspected_pathogen_type: categorical
# Allowed values:
#   - bacterial
#   - viral
#   - fungal
#   - mixed
#   - unknown


# ------------------------------
# SOFA-aligned organ dysfunction
# (mortality-relevant signals)
# ------------------------------

# Respiratory (SOFA respiratory)
# resp_failure_present: 0 / 1
# mechanical_ventilation_present: 0 / 1
# hypoxemia_present: 0 / 1

# Cardiovascular (SOFA CV)
# hypotension_present: 0 / 1
# vasopressor_use_present: 0 / 1

# Renal (SOFA renal)
# aki_present: 0 / 1
# oliguria_anuria_present: 0 / 1
# dialysis_present: 0 / 1

# Coagulation (SOFA coag)
# thrombocytopenia_present: 0 / 1

# Liver (SOFA liver)
# hyperbilirubinemia_present: 0 / 1

# CNS (SOFA CNS)
# altered_mentation_present: 0 / 1


# ------------------------------
# Mortality modifiers
# ------------------------------

# high_risk_course_language_present: 0 / 1
# Indicates language suggesting deterioration or poor prognosis, e.g.:
#   "worsening", "critical", "guarded", "poor prognosis",
#   "multiorgan failure", "rapid decline", etc.

# limitation_of_care_present: 0 / 1
# Indicates limitations or withdrawal of care, e.g.:
#   DNR, DNI, comfort measures only, hospice, withdrawal of life support


# ------------------------------
# Imaging-level severity
# ------------------------------

# imaging_severity_impression: categorical
# Allowed values:
#   - mild
#   - moderate
#   - severe
#   - unknown
# Based on overall severity phrasing in the radiology impression

### 4.2 Truncation Procedure

#### NOTE
The radiology notes for the patient with subject ID 15114531 exceeded the model's maximum token length, so this patient had to be handled separately using token truncation.

In [None]:
# Pull the note text of subject 15114531 and cut it into four consecutive pieces.
note_text = nlp_ready_df.loc[nlp_ready_df["subject_id"] == 15114531, TEXT_COL].iloc[0]

# Cut points: the midpoint, plus one point half a midpoint to either side of it.
mid = len(note_text) // 2
half_of_mid = mid // 2
q1, q3 = mid - half_of_mid, mid + half_of_mid

# Slice between successive cut points (None stands for the start / end of the text).
note_part_1, note_part_2, note_part_3, note_part_4 = (
    note_text[lo:hi] for lo, hi in zip((None, q1, mid, q3), (q1, mid, q3, None))
)

In [None]:
# Truncation policy for the structured-feature pipeline; the orchestrator cell
# below only uses it when its `truncation_policy` argument is uncommented.
policy = TruncationPolicy(
    max_tokens=125_000,
    apply_to_subject_ids={15114531},  # only this patient can be truncated
)

### 4.3 Run Orchestrator

In [None]:
# Model used for structured feature extraction (LLM Extension B).
cfg_extract = ExtractionConfig(model="gpt-4o-mini")

# Build the input table for the extraction pipeline from the note-text column.
df_inputs_B = prepare_feature_inputs(
    nlp_ready_df,
    text_col=TEXT_COL,
    id_col="subject_id",
)

df_llmB = run_llm_structured_features_pipeline(
    df_inputs=df_inputs_B,
    cfg=cfg_extract,
    model_schema=SEPSIS_MORTALITY_MODEL_SCHEMA,
    artifact_path=DEFAULT_FEATURES_ARTIFACT_PATH,
    prefix="llmB_",
    source_path="data/interim/data_nlp_ready.csv",
    verbose=True,
    batch_size=100,
    checkpoint_every=100,
    resume=True,
    retry_errors=False, # <- set to True if failure occurs
    #truncation_policy=policy # <- set := policy if failure occurs
)

# Check that features_manifest.json exists next to the features artifact.
# The resolve_path result is wrapped in Path (as the CSV-rewrite cell further
# down does) so `.parent` works whether it returns a str or a Path.
manifest_path = Path(resolve_path(DEFAULT_FEATURES_ARTIFACT_PATH)).parent / "features_manifest.json"
print(manifest_path.exists())

### 4.4 Check Dataframe

In [None]:
import pandas as pd
from src.llm_structured_features import ExtractionConfig, load_feature_cache

# Build an extraction config for the same model as the orchestrator cell and
# load the feature cache for it.
# NOTE(review): `df_cache` is also the name used for the embedding cache in
# section 3; this assignment overwrites it.
cfg = ExtractionConfig(model="gpt-4o-mini")

df_cache = load_feature_cache(cfg)

# Cell output: the loaded feature cache.
df_cache

## 5 Split, Standard Scale, Merge, and Save Splits for LLM Data

### LLM A

### LLM B

In [None]:
# Get Train/Test partitions

llmB_train = df_llmB[df_llmB["subject_id"].isin(train_ids)].copy()
llmB_test  = df_llmB[df_llmB["subject_id"].isin(test_ids)].copy()

assert set(llmB_train["subject_id"]) == train_ids
assert set(llmB_test["subject_id"])  == test_ids
assert llmB_train.shape[0] == X_train_orig.shape[0]
assert llmB_test.shape[0]  == X_test_orig.shape[0]


In [None]:
# Apply Standard Scaling on all LLM Structured Features

llmB_train_scaled, llmB_test_scaled = scale_llm_features(
    X_train=llmB_train,
    X_test=llmB_test,
    model=cfg_extract.model,   # e.g. "gpt-4o-mini"
    prefix="llmB_",
    id_col="subject_id",
)


In [None]:
# Combine the scaled structured features with the scaled LLM-B features on subject_id.
# merge_embeddings_with_features is reused here: the LLM-B structured features
# go in through the *_embed arguments.
# NOTE(review): presumably this writes the merged splits under
# save_dir/prefix (data/processed/llm_B) — confirm against the helper.
X_train_llmB_scaled, X_test_llmB_scaled = merge_embeddings_with_features(
    X_train_features=X_train_orig_scaled,
    X_test_features=X_test_orig_scaled,
    X_train_embed=llmB_train_scaled,
    X_test_embed=llmB_test_scaled,
    id_col="subject_id",
    prefix="llm_B",   # choose the variant folder/name you want
    save_dir="data/processed",
)


## 6 Apply and Save SMOTE Resampling

In [None]:
# ============================================================
# LLM variants dataset container (keep subject_id through SMOTE)
# ============================================================

# One entry per LLM variant. Each entry holds copies of the merged X splits and
# the original targets, so later in-place edits do not touch the source frames.
datasets = {
    "llm_B": {
        "X_train": X_train_llmB_scaled.copy(),
        "X_test":  X_test_llmB_scaled.copy(),
        "y_train": y_train_orig.copy(),
        "y_test":  y_test_orig.copy(),
    },

    # ----------------------------
    # LLM A (comment out for now)
    # ----------------------------
    # "llm_A": {
    #     "X_train": X_train_llmA_scaled.copy(),
    #     "X_test":  X_test_llmA_scaled.copy(),
    #     "y_train": y_train_orig.copy(),
    #     "y_test":  y_test_orig.copy(),
    # },
}


In [None]:
# ============================================================
# Apply SMOTE per LLM variant (train only)
# ============================================================

# ------------------------------------------------------------
# Step 1: SMOTE-resample the training split of every variant and
# store the result next to the untouched splits.
# ------------------------------------------------------------
for variant, data in datasets.items():
    balanced = resample_training_data(data["X_train"], data["y_train"], method="smote")
    data["X_train_res"], data["y_train_res"] = balanced

    print_class_balance(data["y_train_res"], f"{variant} training set (after SMOTE)")


# ------------------------------------------------------------
# Step 2: write the resampled training sets to the variant's
# processed-data folder as CSV files.
# ------------------------------------------------------------
for variant, data in datasets.items():
    out_dir = resolve_path(f"data/processed/{variant}")
    os.makedirs(out_dir, exist_ok=True)

    # File name -> table to write; the target is stored under the column name "target".
    tables_to_write = {
        f"data_{variant}_xtrain_res.csv": pd.DataFrame(data["X_train_res"]),
        f"data_{variant}_ytrain_res.csv": pd.Series(data["y_train_res"], name="target"),
    }
    for file_name, table in tables_to_write.items():
        table.to_csv(os.path.join(out_dir, file_name), index=False)

    print(f"‚úÖ Saved SMOTE-balanced training sets for {variant} under {out_dir}")


### Remove Subject ID from All Sets

In [None]:
# ============================================================
# Remove subject_id and non-feature cols from all X_* datasets (IN PLACE)
# ============================================================

# Identifier / metadata columns that must not reach the classifiers.
drop_cols = ["subject_id", "first_hosp_stay", "suspected_infection", "sepsis3"]

for variant, data in datasets.items():
    # Only the frames that actually exist for this variant are touched.
    available_frames = [
        data[key]
        for key in ("X_train", "X_test", "X_train_res")
        if data.get(key) is not None
    ]

    for frame in available_frames:
        unwanted = [col for col in drop_cols if col in frame.columns]
        if unwanted:
            # Mutates the frame stored inside `datasets`.
            frame.drop(columns=unwanted, inplace=True)

    print(
        f"‚úÖ Dropped id/meta cols for {variant} | "
        f"X_train: {data['X_train'].shape}, "
        f"X_train_res: {data['X_train_res'].shape}, "
        f"X_test: {data['X_test'].shape}"
    )


In [None]:
# Variants whose saved CSV splits should be cleaned on disk.
variants = [
    "llm_B",
    # "llm_A",
]

for variant in variants:
    base_dir = Path(resolve_path(f"data/processed/{variant}"))

    # Same three files, in the same order: train, test, resampled train.
    for split_name in ("xtrain", "xtest", "xtrain_res"):
        fp = base_dir / f"data_{variant}_{split_name}.csv"

        if not fp.exists():
            print(f"‚ö†Ô∏è Missing file (skipped): {variant}/{fp.name}")
            continue

        df = pd.read_csv(fp)
        cols_present = [c for c in drop_cols if c in df.columns]

        # Nothing to remove: leave the file untouched.
        if not cols_present:
            print(f"‚ÑπÔ∏è {variant}/{fp.name} already clean")
            continue

        # Overwrite the file without the id/meta columns.
        df = df.drop(columns=cols_present)
        df.to_csv(fp, index=False)
        print(f"‚úÖ Rewrote {variant}/{fp.name} (dropped: {cols_present})")


## 7 Define Classifiers & Hyperparameter Distributions

In [None]:
# Classifier instances, their hyperparameter search spaces, and the per-classifier
# random-search budgets, all taken from the project's model module.
classifiers = get_classifiers()
param_spaces = get_param_distributions()
n_iter_random_per_clf = get_n_iter_random_per_clf()

# Budget per classifier for display, falling back to 50 where none is configured.
# NOTE(review): the training cell passes `n_iter_random_per_clf`, not this
# subset, so the fallback of 50 is not what training receives — confirm this is intended.
n_iter_random_subset = {k: n_iter_random_per_clf.get(k, 50) for k in classifiers.keys()}
print("n_iter_random_per_clf:", n_iter_random_subset)

print("‚úÖ Classifiers and hyperparameter grids initialized.")
print("Available classifiers:", list(classifiers.keys()))

## 8 Classifier Re-Training with LLM A Embeddings

In [None]:
# ============================================================
# Run repeated CV with mixed search strategy — LLM A (COMMENTED)
# ============================================================

# mode = "llm_A"
# save_prefix = f"results/models/{mode}/"
#
# X_train = X_train_llmA_merged
# X_test  = X_test_llmA_merged
# y_train = y_train_orig
# y_test  = y_test_orig
#
# X_train_res = X_train_llmA_res
# y_train_res = y_train_llmA_res
#
# results_llmA, summary_llmA = repeated_cv_with_mixed_search(
#     X_train,
#     y_train,
#     X_test,
#     y_test,
#     classifiers=classifiers,
#     param_spaces=param_spaces,
#     X_train_smote=X_train_res,
#     y_train_smote=y_train_res,
#     n_splits=5,
#     n_repeats=10,
#     scoring=auc_scorer,
#     n_iter_random=None,
#     n_iter_random_per_clf=n_iter_random_per_clf,
#     save_prefix=save_prefix,
#     mode=mode,
#     log_mlflow=True,
# )
#
# export_summary(summary_llmA, save_prefix="reports/", mode=mode)
# print(f"‚úÖ Finished model training for {mode} dataset.")


## 9 Classifier Re-Training with LLM B Structured Features

In [None]:
# ------------------------------------------------------------
# LLM B: repeated cross-validation with the mixed search strategy
# ------------------------------------------------------------

mode = "llm_B"
save_prefix = f"results/models/{mode}/"

# Pull the six LLM-B arrays out of the dataset container in one step.
llmB_data = datasets[mode]
X_train, X_test, y_train, y_test, X_train_res, y_train_res = (
    llmB_data[key]
    for key in ("X_train", "X_test", "y_train", "y_test", "X_train_res", "y_train_res")
)

# Search / CV settings: the scorer is the same as Task 4 / 08, and
# n_iter_random stays None because the per-classifier dict is supplied.
search_settings = dict(
    classifiers=classifiers,
    param_spaces=param_spaces,
    X_train_smote=X_train_res,
    y_train_smote=y_train_res,
    n_splits=5,
    n_repeats=10,
    scoring=auc_scorer,
    n_iter_random=None,
    n_iter_random_per_clf=n_iter_random_per_clf,
    save_prefix=save_prefix,
    mode=mode,
    log_mlflow=True,
)

results_llmB, summary_llmB = repeated_cv_with_mixed_search(
    X_train, y_train, X_test, y_test, **search_settings
)

# Write the summary under reports/.
export_summary(summary_llmB, save_prefix="reports/", mode=mode)
print(f"‚úÖ Finished model training for {mode} dataset.")


## 10 Check Cosine Similarity Between Two LLM A Outputs (Original vs. Line-Shuffled Notes)

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def shuffle_lines(text: str, seed: int = 0) -> str:
    """Return ``text`` with its non-blank lines in a seeded random order.

    Args:
        text: Input text; it is converted with ``str()`` before splitting.
        seed: Seed for the NumPy random generator that drives the shuffle.

    Returns:
        The non-blank lines joined with newlines. Lines that are empty or
        whitespace-only are discarded.
    """
    kept_lines = []
    for line in str(text).splitlines():
        if line.strip():
            kept_lines.append(line)

    # In-place shuffle with a fresh generator, so the order depends only on the seed.
    np.random.default_rng(seed).shuffle(kept_lines)
    return "\n".join(kept_lines)

# pick 50 subjects
df50 = nlp_ready_df[["subject_id", TEXT_COL]].sample(50, random_state=7).copy()

# Same subjects, but with the lines of every note shuffled.
df50_shuf = df50.copy()
df50_shuf[TEXT_COL] = df50_shuf[TEXT_COL].apply(lambda t: shuffle_lines(t, seed=7))

# run summaries twice (different artifact paths so caches stay separate)
dfA = run_long_context_embeddings_pipeline(
    nlp_ready_df=df50,
    raw_text_col=TEXT_COL,
    summary_artifact_path="data/processed/llm_A/long_context_summaries_50.parquet",
    embeddings_artifact_path="data/processed/llm_A/embeddings_50.parquet",
    verbose=False,
)

dfB = run_long_context_embeddings_pipeline(
    nlp_ready_df=df50_shuf,
    raw_text_col=TEXT_COL,
    summary_artifact_path="data/processed/llm_A/long_context_summaries_50_shuf.parquet",
    embeddings_artifact_path="data/processed/llm_A/embeddings_50_shuf.parquet",
    verbose=False,
)

# Rows are paired by position after sorting, so both runs must contain exactly
# the same subjects. Without this check, a row missing from one run would make
# np.diag compare different patients without raising.
dfA_sorted = dfA.sort_values("subject_id").reset_index(drop=True)
dfB_sorted = dfB.sort_values("subject_id").reset_index(drop=True)
assert dfA_sorted["subject_id"].tolist() == dfB_sorted["subject_id"].tolist(), (
    "original and shuffled runs returned different subject_id sets"
)

emb_cols = [c for c in dfA.columns if c.startswith("emb_")]
A = dfA_sorted[emb_cols].to_numpy()
B = dfB_sorted[emb_cols].to_numpy()

# Diagonal of the pairwise matrix = similarity of each subject with its own shuffled version.
sims = np.diag(cosine_similarity(A, B))
print("mean cosine:", sims.mean())
print("min cosine:", sims.min())
print("p10 cosine:", np.quantile(sims, 0.10))