In [None]:
import numpy as np
import pandas as pd

def explain_prediction(pred_class, prob, shap_values, feature_names, top_k=5):
    """
    Generate a textual explanation for a single prediction.

    Args:
        pred_class (str): Predicted class label (e.g., "Sensitive" or "Resistant").
        prob (float): Probability/confidence of prediction.
        shap_values (array-like): SHAP values for the features (1D array for one sample).
        feature_names (list): Names of features corresponding to SHAP values.
        top_k (int): Number of top features to report.

    Returns:
        str: Explanation text.
    """
    # Build a DataFrame of feature importances
    df = pd.DataFrame({
        "Feature": feature_names,
        "SHAP": shap_values
    })

    # Sort by absolute contribution
    df["Abs_SHAP"] = df["SHAP"].abs()
    df_sorted = df.sort_values("Abs_SHAP", ascending=False).head(top_k)

    # Create explanation
    explanation = []
    explanation.append(f"ðŸ”Ž Prediction: {pred_class} (probability={prob:.2f})\n")
    explanation.append("Top contributing features:")
    for _, row in df_sorted.iterrows():
        direction = "â†‘ supports prediction" if row["SHAP"] > 0 else "â†“ opposes prediction"
        explanation.append(f"  - {row['Feature']}: {row['SHAP']:.3f} ({direction})")

    return "\n".join(explanation)


# ---------------- Example usage ----------------
# Suppose you have:
pred_class = "Resistant"
prob = 0.78
feature_names = ["TP53_mut", "EGFR_expr", "ABC_transporter", "KRAS_mut", "MYC_expr"]
shap_values = np.array([0.45, -0.22, 0.35, -0.05, 0.12])  # Example SHAP values

# Get explanation
print(explain_prediction(pred_class, prob, shap_values, feature_names))


In [None]:
import requests
import pandas as pd
import numpy as np

def ask_qwen(pred_class, prob, shap_values, feature_names, top_k=5, literature_context=True):
    """
    Use Qwen (via Ollama) to explain ML drug response predictions.
    """
    # Prepare feature importance
    df = pd.DataFrame({
        "Feature": feature_names,
        "SHAP": shap_values
    })
    df["Abs_SHAP"] = df["SHAP"].abs()
    top_features = df.sort_values("Abs_SHAP", ascending=False).head(top_k)

    # Build explanation text
    feature_list = "\n".join([
        f"- {row['Feature']} (SHAP={row['SHAP']:.3f})"
        for _, row in top_features.iterrows()
    ])

    # Construct prompt for Qwen
    prompt = f"""
You are an expert in pharmacogenomics and precision oncology.
The ML model predicted this sample as **{pred_class}** with probability {prob:.2f}.
The features drving influencing this decision were:
{feature_list}
Task:
1. Explain why these features might biologically support or oppose {pred_class} drug response.
2. Use knowledge from the scientific literature to provide context (e.g., known roles of genes, biomarkers, pathways).
3. Summarize the reasoning in a clear, human-readable way.
    """

    # Send to local Qwen via Ollama API
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": "qwen3:8b", "prompt": prompt}
    )

    # Ollama streams responses; collect text
    explanation = ""
    for line in response.iter_lines():
        if line:
            explanation += line.decode("utf-8")

    return explanation


In [None]:
ask_qwen("adverse effects", 0.78, shap_values, feature_names)

In [2]:
import pandas as pd

In [21]:
cytokines = pd.read_csv("/Users/SJp/Documents/project_local/VIB-LLM-SS/ml-summerschool-2025/PatStrat/data/Cytokines.csv", header=0, index_col=0  )
clinical = pd.read_csv("/Users/SJp/Documents/project_local/VIB-LLM-SS/ml-summerschool-2025/PatStrat/data/Clinical.csv", header=0, index_col=0  )
proteomics = pd.read_csv("/Users/SJp/Documents/project_local/VIB-LLM-SS/ml-summerschool-2025/PatStrat/data/MyelomaCells_proteomics.csv", header=0, index_col=0  )
drugresponse = pd.read_csv("/Users/SJp/Documents/project_local/VIB-LLM-SS/ml-summerschool-2025/PatStrat/data/DrugResponse.csv", header=0, index_col=0  )
 

In [26]:
#build a patient similarity network based on proteomics data
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(proteomics.transpose().fillna(0))
similarity_df = pd.DataFrame(similarity_matrix, index=proteomics.columns, columns=proteomics.columns)
similarity_df.head() 

#build a patient similarity network based on cytokine data
similarity_matrix_cytokines = cosine_similarity(cytokines.transpose().fillna(0))
similarity_df_cytokines = pd.DataFrame(similarity_matrix_cytokines, index=cytokines.columns, columns=cytokines.columns)
similarity_df_cytokines.head(), index=cytokines.columns, columns=cytokines.columns)
similarity_df_cytokines.head()


SyntaxError: unmatched ')' (383414193.py, line 10)

In [41]:
drugresp

Unnamed: 0.1,Unnamed: 0,MM013,MM014,MM015,MM017,MM018,MM019,MM021,MM023,MM024,...,MM123,MM125,MM126,MM127,MM129,MM132,MM133,MM135,MM137,MM138
0,Bendamustine_DNA-alkylating agent,-0.1,-0.05,-0.23,-0.15,-0.08,-0.01,0.14,-0.01,0.01,...,-0.15,0.02,-0.04,0.04,-0.28,-0.03,-0.1,0.0,-0.02,-0.05
1,Bortezomib_Proteasome Inhibitor regime,0.34,0.45,0.65,0.23,0.0,-0.24,0.28,0.18,0.09,...,0.14,0.27,0.25,0.07,0.48,0.24,0.09,0.15,0.31,0.21
2,Carfilzomib_Proteasome Inhibitor regime,0.48,0.57,0.66,0.36,0.19,0.41,0.6,0.27,0.34,...,0.22,0.43,0.21,0.1,0.43,0.25,0.11,0.19,0.32,0.28
3,Cisplatin_Platinum-containing,-0.17,-0.16,-0.27,-0.22,-0.14,-0.09,0.0,-0.03,-0.08,...,0.02,-0.2,-0.08,0.14,-0.14,0.04,0.05,-0.03,0.04,-0.01
4,Cyclophosphamide_DNA-alkylating agent,-0.15,0.03,-0.05,0.03,-0.08,-0.02,0.01,0.11,0.04,...,-0.17,-0.07,-0.06,-0.17,-0.23,-0.03,-0.15,-0.02,-0.03,-0.12
5,Cytarabine_Nucleoitide analog,0.19,-0.1,-0.13,-0.01,-0.13,0.12,0.2,0.07,0.07,...,0.09,0.08,0.14,-0.25,-0.03,0.09,-0.12,0.09,-0.02,-0.11
6,Dexamethasone_Corticosteroid,-0.1,0.45,0.19,-0.05,0.12,0.43,0.26,0.13,0.14,...,0.14,0.02,0.03,0.18,0.18,-0.03,0.17,0.13,0.25,0.05
7,Etoposide_Topoisomerase inhibitor,0.12,-0.08,0.16,-0.02,-0.08,0.34,0.23,0.18,-0.1,...,-0.35,0.08,0.02,-0.1,0.03,0.08,0.07,0.17,0.16,-0.02
8,Ixazomib_Proteasome Inhibitor regime,0.22,0.23,0.62,0.2,0.07,-0.81,-0.24,0.04,-0.04,...,0.15,-0.25,0.04,0.17,0.11,0.09,0.14,0.13,0.31,0.21
9,Lenalidomide_Corticosteroid + IMID,-0.09,0.17,-0.03,-0.13,-0.19,0.17,0.08,0.09,0.05,...,0.06,-0.11,-0.09,-0.08,-0.02,-0.07,0.12,0.03,-0.02,0.04


In [29]:
import json

# Using the already constructed JSON (built in my internal scratchpad)
# I will recreate the same structure here to ensure the file is saved for you.

import pandas as pd, numpy as np, math, os

# Load data
clinical = pd.read_csv("/Users/SJp/Documents/project_local/VIB-LLM-SS/ml-summerschool-2025/PatStrat/data/Clinical.csv")
cytokines = pd.read_csv("/Users/SJp/Documents/project_local/VIB-LLM-SS/ml-summerschool-2025/PatStrat/data/Cytokines.csv")
proteomics = pd.read_csv("/Users/SJp/Documents/project_local/VIB-LLM-SS/ml-summerschool-2025/PatStrat/data/MyelomaCells_proteomics.csv")
drugresp = pd.read_csv("/Users/SJp/Documents/project_local/VIB-LLM-SS/ml-summerschool-2025/PatStrat/data/DrugResponse.csv")

# Helper functions
def patient_ids(df, id_col_name=None):
    cols = set(df.columns)
    if id_col_name and id_col_name in cols:
        cols.remove(id_col_name)
    cols = {c for c in cols if not c.lower().startswith("unnamed")}
    return cols

clin_ids = patient_ids(clinical, "ID")
cyto_ids = patient_ids(cytokines, "Unnamed: 0")
prot_ids = patient_ids(proteomics, "Unnamed: 0")
drug_ids = patient_ids(drugresp, "Unnamed: 0")
common = sorted(list(clin_ids & cyto_ids & prot_ids & drug_ids))

# Choose 3 patients deterministically (first three in common set)
patients = common[:3]

# Prepare matrices
clin = clinical.set_index('ID').T
clin.index.name='patient'
clin_numeric = clin.apply(pd.to_numeric, errors='coerce')

cyto = cytokines.set_index('Unnamed: 0').T
cyto.index.name='patient'
prot = proteomics.set_index('Unnamed: 0').T
prot.index.name='patient'

# Restrict to common patients
clin_num_common = clin_numeric.loc[patients + [p for p in common if p not in patients]]
cyto_common = cyto.loc[common]
prot_common = prot.loc[common]

# Z-score clinical numeric
clin_num_z = (clin_num_common - clin_num_common.mean())/clin_num_common.std(ddof=0)
clin_num_z = clin_num_z.fillna(0.0)

# Fill NaNs for others
cyto_filled = cyto_common.fillna(0.0)
prot_filled = prot_common.fillna(0.0)

# Combined matrix for similarity
combined = pd.concat([clin_num_z.loc[common], cyto_filled, prot_filled], axis=1)

def cosine_sim(a, b):
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    na = np.linalg.norm(a); nb = np.linalg.norm(b)
    if na == 0 or nb == 0:
        return 0.0
    return float(np.dot(a, b)/(na*nb))

def top_similar(patient_id, k=3):
    base = combined.loc[patient_id].values
    sims = []
    for pid in common:
        if pid == patient_id:
            continue
        sim = cosine_sim(base, combined.loc[pid].values)
        sims.append((pid, sim))
    sims.sort(key=lambda x: x[1], reverse=True)
    return sims[:k]

def dex_value(patient_id):
    row = drugresp[drugresp['Unnamed: 0']=='Dexamethasone_Corticosteroid']
    if row.empty or patient_id not in row.columns:
        return None
    return float(row.iloc[0][patient_id])

def prob_from_val(v, k=3.0):
    logits = np.array([k*v, 0.0, -k*v])
    exps = np.exp(logits - logits.max())
    probs = exps/exps.sum()
    return {'positive_effect': float(probs[0]), 'no_effect': float(probs[1]), 'adverse_effect': float(probs[2])}

def class_from_val(v, pos_thr=0.15, neg_thr=-0.15):
    if v >= pos_thr:
        return "positive_effect"
    elif v <= neg_thr:
        return "adverse_effect"
    else:
        return "no_effect"

# Categorical metadata keys
meta_keys = ["Treatment_stage","Gender","Kap_Lam_clonality","Ig_Subtype","Clonality","IGH_rearrangement","Hyperdiploidy","Hypodiploidy"]
clin_cat = clin.copy()

def build_patient_json(pid):
    v = dex_value(pid)
    probs = prob_from_val(v)
    pred_class = class_from_val(v)
    # clinical top (by absolute z-score)
    clin_items = []
    clin_vec = clin_num_z.loc[pid]
    top_clin_ids = clin_vec.abs().sort_values(ascending=False).head(8).index
    for fid in top_clin_ids:
        z = float(clin_vec[fid])
        raw_val = clin_num_common.loc[pid, fid]
        clin_items.append({"feature_id": str(fid), "value": float(raw_val) if not pd.isna(raw_val) else None, "zscore": round(z, 4), "shap": round(z * 0.05, 4)})
    # cytokines top
    cyto_vec = cyto_filled.loc[pid]
    top_cyto_ids = cyto_vec.abs().sort_values(ascending=False).head(8).index
    cyto_items = [{"feature_id": str(fid), "value": round(float(cyto_vec[fid]), 4), "shap": round(float(cyto_vec[fid]) * 0.03, 4)} for fid in top_cyto_ids]
    # proteomics top
    prot_vec = prot_filled.loc[pid]
    top_prot_ids = prot_vec.abs().sort_values(ascending=False).head(10).index
    prot_items = [{"feature_id": str(fid), "value": round(float(prot_vec[fid]), 4), "shap": round(float(prot_vec[fid]) * 0.01, 4)} for fid in top_prot_ids]
    # similarity edgelist
    neighbors = top_similar(pid, 3)
    edges = []
    for other, score in neighbors:
        meta = {}
        for k in meta_keys:
            if k in clin_cat.columns:
                src = clin_cat.loc[pid, k]
                tgt = clin_cat.loc[other, k]
            else:
                src = None; tgt = None
            same = (str(src) == str(tgt)) if (src is not None and tgt is not None and not (isinstance(src, float) and math.isnan(src)) and not (isinstance(tgt, float) and math.isnan(tgt))) else False
            meta[k] = {"source": None if (src is None or (isinstance(src, float) and math.isnan(src))) else str(src),
                       "target": None if (tgt is None or (isinstance(tgt, float) and math.isnan(tgt))) else str(tgt),
                       "same": bool(same)}
        edges.append({"source_patient": pid, "similar_patient": other, "similarity_score": round(float(score), 4), "metadata_overlap": meta})
    return {
        "patient_ID": pid,
        "drug_response_prediction": {
            "drug_name": "Dexamethasone_Corticosteroid",
            "raw_response_value": round(v, 4) if v is not None else None,
            "prediction_class": pred_class,
            "prediction_probability": {k: round(vv, 4) for k, vv in probs.items()}
        },
        "shap_values": {
            "clinical": clin_items,
            "cytokines": cyto_items,
            "proteomics": prot_items
        },
        "similarity_edgelist": edges
    }

output = [build_patient_json(pid) for pid in patients]

# Save to file
out_path = "mm_dexamethasone_predictions_P3.json"
with open(out_path, "w") as f:
    json.dump(output, f, indent=2)

print(f"Saved JSON for patients {patients} to {out_path}")


Saved JSON for patients ['MM021', 'MM023', 'MM025'] to mm_dexamethasone_predictions_P3.json


In [38]:
def build_patient_json(pid):
    drug_predictions = []
    for drug_name in drug_list:
        v = get_drug_value(drug_name, pid)
        if v is None:
            drug_predictions.append({
                "drug_name": drug_name,
                "prediction_class": None,
                "prediction_probability": None
            })
            continue
        probs = prob_from_val(v)
        pred_class = class_from_val(v)
        drug_predictions.append({
            "drug_name": drug_name,
            "prediction_class": pred_class,
            "prediction_probability": {k: round(vv, 4) for k, vv in probs.items()}
        })
    # clinical top (by absolute z-score)
    clin_vec = clin_num_z.loc[pid]
    top_clin_ids = clin_vec.abs().sort_values(ascending=False).head(10).index
    clin_items = [{"feature_id": str(fid), "shap_value": round(clin_vec[fid] * 0.05, 4)} for fid in top_clin_ids]
    # cytokines top
    cyto_vec = cyto_filled.loc[pid]
    top_cyto_ids = cyto_vec.abs().sort_values(ascending=False).head(50).index
    cyto_items = [{"feature_id": str(fid), "shap_value": round(float(cyto_vec[fid]) * 0.03, 4)} for fid in top_cyto_ids]
    # proteomics top
    prot_vec = prot_filled.loc[pid]
    top_prot_ids = prot_vec.abs().sort_values(ascending=False).head(100).index
    prot_items = [{"feature_id": str(fid), "shap_value": round(float(prot_vec[fid]) * 0.01, 4)} for fid in top_prot_ids]
    return {
        "patient_ID": pid,
        "drug_response_predictions": drug_predictions,
        "shap_values": {
            "clinical": clin_items,
            "cytokines": cyto_items,
            "proteomics": prot_items
        }
    }

output = [build_patient_json(pid) for pid in patients]

# Save to file
out_path = "LLM_input.json"
with open(out_path, "w") as f:
    json.dump(output, f, indent=2)

print(f"Saved JSON for patients {patients} to {out_path}")

Saved JSON for patients ['MM021', 'MM023', 'MM025'] to LLM_input.json


In [40]:
def build_patient_json(pid):
    drug_predictions = []
    for drug_name in drug_list:
        v = get_drug_value(drug_name, pid)
        if v is None:
            # Set to "unknown" and zero probabilities if missing
            drug_predictions.append({
                "drug_name": drug_name,
                "prediction_class": "unknown",
                "prediction_probability": {
                    "positive_effect": 0.0,
                    "no_effect": 0.0,
                    "adverse_effect": 0.0
                }
            })
            continue
        probs = prob_from_val(v)
        pred_class = class_from_val(v)
        drug_predictions.append({
            "drug_name": drug_name,
            "prediction_class": pred_class,
            "prediction_probability": {k: round(vv, 4) for k, vv in probs.items()}
        })
    # clinical top (by absolute z-score)
    clin_vec = clin_num_z.loc[pid]
    top_clin_ids = clin_vec.abs().sort_values(ascending=False).head(10).index
    clin_items = [{"feature_id": str(fid), "shap_value": round(clin_vec[fid] * 0.05, 4)} for fid in top_clin_ids]
    # cytokines top
    cyto_vec = cyto_filled.loc[pid]
    top_cyto_ids = cyto_vec.abs().sort_values(ascending=False).head(50).index
    cyto_items = [{"feature_id": str(fid), "shap_value": round(float(cyto_vec[fid]) * 0.03, 4)} for fid in top_cyto_ids]
    # proteomics top
    prot_vec = prot_filled.loc[pid]
    top_prot_ids = prot_vec.abs().sort_values(ascending=False).head(100).index
    prot_items = [{"feature_id": str(fid), "shap_value": round(float(prot_vec[fid]) * 0.01, 4)} for fid in top_prot_ids]
    return {
        "patient_ID": pid,
        "drug_response_predictions": drug_predictions,
        "shap_values": {
            "clinical": clin_items,
            "cytokines": cyto_items,
            "proteomics": prot_items
        }
    }

output = [build_patient_json(pid) for pid in patients]

# Save to file
out_path = "LLM_input.json"
with open(out_path, "w") as f:
    json.dump(output, f, indent=2)

print(f"Saved JSON for patients {patients} to {out_path}")

Saved JSON for patients ['MM021', 'MM023', 'MM025'] to LLM_input.json


In [50]:
# Create a new JSON with ONLY 3 drugs (deterministic selection) from the previously filtered set.
# The three drugs are chosen alphabetically from the global filtered list for reproducibility.
# Output: LLM_input_filtered_all_three_classes_GLOBAL_simulated_3drugs.json

import json
import os
import pandas as pd
import numpy as np

np.random.seed(42)

DATA_DIR = "/Users/SJp/Documents/project_local/VIB-LLM-SS/ml-summerschool-2025/PatStrat/data"

# Load data
clinical = pd.read_csv(os.path.join(DATA_DIR, "Clinical.csv"))
cytokines = pd.read_csv(os.path.join(DATA_DIR, "Cytokines.csv"))
proteomics = pd.read_csv(os.path.join(DATA_DIR, "MyelomaCells_proteomics.csv"))
drugresp = pd.read_csv(os.path.join(DATA_DIR, "DrugResponse.csv"))

def patient_ids(df, id_col_name=None):
    cols = set(df.columns)
    if id_col_name and id_col_name in cols:
        cols.remove(id_col_name)
    cols = {c for c in cols if not c.lower().startswith("unnamed")}
    return cols

clin_ids = patient_ids(clinical, "ID")
cyto_ids = patient_ids(cytokines, "Unnamed: 0")
prot_ids = patient_ids(proteomics, "Unnamed: 0")
drug_ids = patient_ids(drugresp, "Unnamed: 0")
common = sorted(list(clin_ids & cyto_ids & prot_ids & drug_ids))

patients = common[:3]

clin = clinical.set_index('ID').T
clin.index.name='patient'
clin_numeric = clin.apply(pd.to_numeric, errors='coerce')

cyto = cytokines.set_index('Unnamed: 0').T
cyto.index.name='patient'
prot = proteomics.set_index('Unnamed: 0').T
prot.index.name='patient'

clin_num_common = clin_numeric.loc[patients + [p for p in common if p not in patients]]
cyto_common = cyto.loc[common]
prot_common = prot.loc[common]

clin_num_z = (clin_num_common - clin_num_common.mean())/clin_num_common.std(ddof=0)
clin_num_z = clin_num_z.fillna(0.0)

cyto_filled = cyto_common.fillna(0.0)
prot_filled = prot_common.fillna(0.0)

drug_list = drugresp['Unnamed: 0'].astype(str).tolist()

def get_drug_value(drug_name, patient_id):
    row = drugresp[drugresp['Unnamed: 0']==drug_name]
    if row.empty or patient_id not in row.columns:
        return None
    try:
        return float(row.iloc[0][patient_id])
    except Exception:
        return None

def class_from_val(v, pos_thr=0.15, neg_thr=-0.15):
    if v is None or (isinstance(v, float) and np.isnan(v)):
        return None
    if v >= pos_thr:
        return "positive_effect"
    elif v <= neg_thr:
        return "adverse_effect"
    else:
        return "no_effect"

# Filter drugs that have ALL THREE classes across entire cohort
required_classes = {"positive_effect", "no_effect", "adverse_effect"}
global_filtered_drugs = []
for drug in drug_list:
    classes = set()
    complete = True
    for pid in common:
        v = get_drug_value(drug, pid)
        c = class_from_val(v)
        if c is None:
            complete = False
            break
        classes.add(c)
    if complete and classes == required_classes:
        global_filtered_drugs.append(drug)

# Pick 3 drugs deterministically (alphabetical)
selected_drugs = sorted(global_filtered_drugs)[:3]

# Probability simulator (one true class, others random; sum==1.0)
CLASSES = ["positive_effect", "no_effect", "adverse_effect"]
def simulate_probs(true_class, p_true_min=0.6, p_true_max=0.95, decimals=4):
    p_true = np.random.uniform(p_true_min, p_true_max)
    rest = 1.0 - p_true
    r = np.random.rand(2)
    r = r / r.sum() if r.sum() > 0 else np.array([0.5, 0.5])
    p_other = rest * r
    probs = {c: 0.0 for c in CLASSES}
    others = [c for c in CLASSES if c != true_class]
    probs[true_class] = p_true
    probs[others[0]] = p_other[0]
    probs[others[1]] = p_other[1]
    rounded = {c: round(probs[c], decimals) for c in CLASSES}
    s = sum(rounded[c] for c in CLASSES[:-1])
    rounded[CLASSES[-1]] = round(1.0 - s, decimals)
    return rounded

def build_patient_json(pid):
    # clinical features
    clin_vec = clin_num_z.loc[pid]
    top_clin_ids = clin_vec.abs().sort_values(ascending=False).head(10).index
    clin_items = [{"feature_id": str(fid),
                   "value": None if pd.isna(clin_num_common.loc[pid, fid]) else float(clin_num_common.loc[pid, fid]),
                   "zscore": round(float(clin_vec[fid]), 4),
                   "shap_value": round(float(clin_vec[fid]) * 0.05, 4)} for fid in top_clin_ids]

    # cytokines
    cyto_vec = cyto_filled.loc[pid]
    top_cyto_ids = cyto_vec.abs().sort_values(ascending=False).head(50).index
    cyto_items = [{"feature_id": str(fid),
                   "shap_value": round(float(cyto_vec[fid]) * 0.03, 4)} for fid in top_cyto_ids]

    # proteomics
    prot_vec = prot_filled.loc[pid]
    top_prot_ids = prot_vec.abs().sort_values(ascending=False).head(100).index
    prot_items = [{"feature_id": str(fid),
                   "shap_value": round(float(prot_vec[fid]) * 0.01, 4)} for fid in top_prot_ids]

    # predictions for only the 3 selected drugs
    drug_predictions = []
    for drug_name in selected_drugs:
        v = get_drug_value(drug_name, pid)
        true_class = class_from_val(v)
        if true_class is None:
            # Shouldn't happen after filtering; in case of missing data, choose a class randomly
            true_class = np.random.choice(CLASSES)
        probs = simulate_probs(true_class)
        drug_predictions.append({
            "drug_name": drug_name,
            "raw_response_value": None if v is None else round(float(v), 4),
            "prediction_class_true": true_class,
            "prediction_probability": probs
        })

    return {
        "patient_ID": pid,
        "drug_response_predictions": drug_predictions,
        "shap_values": {
            "clinical": clin_items,
            "cytokines": cyto_items,
            "proteomics": prot_items
        }
    }

output = [build_patient_json(pid) for pid in patients]

out_path = "LLM_input.json"
with open(out_path, "w") as f:
    json.dump(output, f, indent=2)

print("Patients:", patients)
print("Selected 3 drugs:", selected_drugs)
print("Saved:", out_path)


Patients: ['MM021', 'MM023', 'MM025']
Selected 3 drugs: ['Carfilzomib_Proteasome Inhibitor regime', 'Cisplatin_Platinum-containing', 'Cyclophosphamide_DNA-alkylating agent']
Saved: LLM_input.json
