In [58]:
import pandas as pd
import numpy as np
import pyreadr
import json
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import regularizers
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_recall_fscore_support
import matplotlib.pyplot as plt
from typing import List, Optional, Sequence, Tuple, Union, Dict
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import load_model as keras_load_model
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, MDS
import os
import time
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, train_test_split

from sklearn.metrics import classification_report, confusion_matrix, f1_score

import keras_tuner as kt
from dataclasses import dataclass, asdict
import random
from datetime import datetime
import joblib
from tensorflow.keras.models import load_model





Step 1: We load the expression, gene and sample data we saved after the pre-processing we applied in R. We aim to predict tumor grade from the RNAseq data, so we examine potential class imbalance in the tumor grade sample metadata column, and we remove NA values in tumors with unidentified grade.

In [32]:
#load expression, genes and samples data
expression=pd.read_csv("C:/Users/joann/Desktop/M2/Deep_Learning/merged_expression.csv",index_col=0)
genes=pd.read_csv("C:/Users/joann/Desktop/M2/Deep_Learning/merged_genes.csv")
samples = pyreadr.read_r("C:/Users/joann/Desktop/M2/Deep_Learning/merged_samples.rds")
samples = samples[None]

In [33]:
#check what the expression, genes and samples data look like
print(expression.columns[:5])
print(expression.head(2))
print(samples.head(2))

Index(['ENSG00000276168.1', 'ENSG00000129824.16', 'ENSG00000133048.13',
       'ENSG00000012223.13', 'ENSG00000198695.2'],
      dtype='object')
                              ENSG00000276168.1  ENSG00000129824.16  \
TCGA-HT-7468-01A-11R-2027-07           8.423027           13.253145   
TCGA-DU-7015-01A-11R-2027-07           8.252623            5.532866   

                              ENSG00000133048.13  ENSG00000012223.13  \
TCGA-HT-7468-01A-11R-2027-07            7.491808            4.879747   
TCGA-DU-7015-01A-11R-2027-07            8.707316            5.087127   

                              ENSG00000198695.2  ENSG00000228253.1  \
TCGA-HT-7468-01A-11R-2027-07          15.950897          12.763416   
TCGA-DU-7015-01A-11R-2027-07          16.922914          13.431528   

                              ENSG00000198763.3  ENSG00000133110.15  \
TCGA-HT-7468-01A-11R-2027-07          17.025877            5.074539   
TCGA-DU-7015-01A-11R-2027-07          17.670450            4.085189   


In [34]:
#explore the samples metadata
print(samples.shape)
# list of column names
samples.columns.tolist()

#check for potential class imbalance in tumor grade
print(samples["tumor_grade"].value_counts())

#count how many NA values are in the tumor grade column
print(samples["tumor_grade"].isna().sum())

(925, 114)
tumor_grade
G4    391
G3    216
G2    211
Name: count, dtype: int64
107


In [35]:
#we exclude samples with NA tumor grade
samples_all = samples.copy() #keep original samples dataframe
expression_all = expression.copy() #keep original expression dataframe

samples_sup = samples_all.dropna(subset=["tumor_grade"])
expression_sup = expression_all.loc[samples_sup.index]

assert expression_sup.shape[0] == samples_sup.shape[0], "Mismatch in number of samples between expression and samples dataframes after dropping NA tumor grades."



Step 2: We configure the model and ensure reproducibility.

In [40]:
@dataclass
class Config:
    out_dir: str = "C:/Users/joann/Desktop/M2/Deep_Learning/MLP_run"
    seed: int = 42

    # labels
    label_col: str = "tumor_grade"
    label_map: dict = None  # set in main if None

    # splits
    test_size: float = 0.15
    n_folds: int = 9  # 9-fold CV on train+val portion

    # training/tuning
    max_epochs: int = 100
    early_stop_patience: int = 5

    # tuner
    hyperband_factor: int = 3
    objective: str = "val_accuracy"

def set_global_seed(seed: int) -> None:
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # makes TF ops more deterministic when possible
    try:
        tf.config.experimental.enable_op_determinism()
    except Exception:
        pass

def make_run_dir(base_dir: str) -> str:
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_dir = os.path.join(base_dir, ts)
    os.makedirs(run_dir, exist_ok=True)
    return run_dir

Step 3: We assign the classes. Grade II tumors will be 0, grade III will be 1 and grade IV will be 2.

In [41]:
def make_y(samples_sup: pd.DataFrame, label_col: str, label_map: dict) -> np.ndarray:
    """
    Convert tumor grade strings to integer classes.
    Example mapping: {"G2":0, "G3":1, "G4":2}
    """
    y = samples_sup[label_col].astype(str).map(label_map)
    if y.isna().any():
        bad = samples_sup.loc[y.isna(), label_col].unique()
        raise ValueError(f"Found unmapped labels: {bad}")
    return y.astype(int).to_numpy()


Step 4: We construct our  splits: we will have a hold-out test set, and run a 9-fold cross-validation on the rest of the data. In the cross-validation we include training and validation tests

In [42]:
def split_holdout_test(X: pd.DataFrame, y: np.ndarray, cfg: Config):
    """
    Hold out ONE stratified test set.
    Remaining data is used for 9-fold CV (train/val folds).
    """
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        X, y,
        test_size=cfg.test_size,
        random_state=cfg.seed,
        stratify=y
    )
    return X_trainval, X_test, y_trainval, y_test

def save_test_ids(run_dir: str, X_test: pd.DataFrame) -> None:
    pd.Index(X_test.index).to_series().to_csv(os.path.join(run_dir, "test_ids.csv"), index=False)

def save_fold_ids(run_dir: str, fold: int, X_train: pd.DataFrame, X_val: pd.DataFrame) -> None:
    fold_dir = os.path.join(run_dir, f"fold_{fold:02d}")
    os.makedirs(fold_dir, exist_ok=True)
    pd.Index(X_train.index).to_series().to_csv(os.path.join(fold_dir, "train_ids.csv"), index=False)
    pd.Index(X_val.index).to_series().to_csv(os.path.join(fold_dir, "val_ids.csv"), index=False)

Step 5: We standardize the data, fitting on test data only to avoid data leakage

In [43]:
def standardize_train_val(X_train: pd.DataFrame, X_val: pd.DataFrame):
    scaler = StandardScaler(with_mean=True, with_std=True)
    X_train_s = scaler.fit_transform(X_train).astype(np.float32)
    X_val_s = scaler.transform(X_val).astype(np.float32)
    return X_train_s, X_val_s, scaler

Step 6: We build our multi-layer perceptron model (MLP)

In [38]:
def build_model(hp: kt.HyperParameters, input_dim: int, num_classes: int = 3) -> keras.Model:
    """
    Build an MLP for 3-class classification with tunable hyperparameters.
    """
    #batch size 
    hp.Choice("batch_size", [16, 32, 64, 128])  # registered for the tuner

    # architecture choices
    n_layers = hp.Int("n_layers", 1, 3)
    units = hp.Choice("units", [64, 128, 256, 512])
    dropout = hp.Float("dropout", 0.0, 0.5, step=0.1)
    l2_strength = hp.Choice("l2", [0.0, 1e-5, 1e-4, 1e-3])

    # optimizer choice
    lr = hp.Choice("lr", [1e-4, 3e-4, 1e-3, 3e-3])

    
    inputs = keras.Input(shape=(input_dim,), name="expression")
    x = inputs

    for i in range(n_layers):
        x = layers.Dense(
            units=units,
            activation="relu",
            kernel_regularizer=regularizers.l2(l2_strength),
            name=f"dense_{i+1}"
        )(x)
        if dropout > 0:
            x = layers.Dropout(dropout, name=f"dropout_{i+1}")(x)

    outputs = layers.Dense(num_classes, activation="softmax", name="softmax")(x)
    model = keras.Model(inputs, outputs, name="MLP_grade")

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )
    return model

Step 7: We tune the hyper-paramaters to choose the best model and avoid overfitting

In [44]:
class HyperbandWithBatchSize(kt.Hyperband):
    """Hyperband tuner that automatically uses hp['batch_size'] when fitting."""
    def run_trial(self, trial, *args, **kwargs):
        hp = trial.hyperparameters
        kwargs["batch_size"] = hp.get("batch_size")
        return super().run_trial(trial, *args, **kwargs)

def tune_hyperparameters(X_train_s, y_train, X_val_s, y_val, input_dim: int, run_dir: str, cfg: Config):
    tuner_dir = os.path.join(run_dir, "tuner")
    os.makedirs(tuner_dir, exist_ok=True)

    tuner = HyperbandWithBatchSize(
        hypermodel=lambda hp: build_model(hp, input_dim=input_dim, num_classes=3),
        objective=kt.Objective(cfg.objective, direction="max"),
        max_epochs=cfg.max_epochs,
        factor=cfg.hyperband_factor,
        directory=tuner_dir,
        project_name="grade_mlp"
    )

    early_stop = keras.callbacks.EarlyStopping(
        monitor="val_accuracy",
        mode="max",
        patience=cfg.early_stop_patience,
        restore_best_weights=True
    )

    tuner.search(
        X_train_s, y_train,
        validation_data=(X_val_s, y_val),
        epochs=cfg.max_epochs,
        callbacks=[early_stop],
        verbose=1
    )

    best_hp = tuner.get_best_hyperparameters(1)[0]
    return tuner, best_hp

Step 8: We choose the best model to train

In [45]:
def train_best(X_train_s, y_train, X_val_s, y_val, input_dim: int, best_hp, run_dir: str, cfg: Config):
    model = build_model(best_hp, input_dim=input_dim, num_classes=3)

    ckpt_path = os.path.join(run_dir, "best_model.keras")

    early_stop = keras.callbacks.EarlyStopping(
        monitor="val_accuracy",
        mode="max",
        patience=cfg.early_stop_patience,
        restore_best_weights=True
    )

    checkpoint = keras.callbacks.ModelCheckpoint(
        filepath=ckpt_path,
        monitor="val_loss",
        save_best_only=True
    )

    history = model.fit(
        X_train_s, y_train,
        validation_data=(X_val_s, y_val),
        epochs=cfg.max_epochs,
        batch_size=best_hp.get("batch_size"),
        callbacks=[early_stop, checkpoint],
        verbose=1
    )

    with open(os.path.join(run_dir, "history.json"), "w") as f:
        json.dump(history.history, f, indent=2)

    return model, ckpt_path

Step 9: We evaluate the model

In [52]:
def evaluate(model: keras.Model, X_eval_s, y_eval, run_dir: str, prefix: str = "eval"):
    eval_loss, eval_acc = model.evaluate(X_eval_s, y_eval, verbose=0)

    probs = model.predict(X_eval_s, verbose=0)
    y_pred = np.argmax(probs, axis=1)

    report = classification_report(y_eval, y_pred, output_dict=True)
    cm = confusion_matrix(y_eval, y_pred).tolist()
    macro_f1 = float(f1_score(y_eval, y_pred, average="macro"))

    results = {
        "loss": float(eval_loss),
        "accuracy": float(eval_acc),
        "macro_f1": macro_f1,
        "classification_report": report,
        "confusion_matrix": cm
    }

    with open(os.path.join(run_dir, f"{prefix}_metrics.json"), "w") as f:
        json.dump(results, f, indent=2)

    return results

Step 10: We save all the run metadata

In [47]:
def save_run_metadata(run_dir: str, cfg: Config, best_hp, scaler):
    with open(os.path.join(run_dir, "config.json"), "w") as f:
        json.dump(asdict(cfg), f, indent=2)

    with open(os.path.join(run_dir, "best_hyperparameters.json"), "w") as f:
        json.dump(best_hp.values, f, indent=2)

    joblib.dump(scaler, os.path.join(run_dir, "scaler.joblib"))

    versions = {
        "python": f"{os.sys.version_info.major}.{os.sys.version_info.minor}.{os.sys.version_info.micro}",
        "numpy": np.__version__,
        "pandas": pd.__version__,
        "tensorflow": tf.__version__,
        "keras_tuner": kt.__version__
    }
    with open(os.path.join(run_dir, "versions.json"), "w") as f:
        json.dump(versions, f, indent=2)

Final Step: Main run script

In [53]:
def main(expression_sup: pd.DataFrame, samples_sup: pd.DataFrame):
    cfg = Config()
    if cfg.label_map is None:
        cfg.label_map = {"G2": 0, "G3": 1, "G4": 2}

    set_global_seed(cfg.seed)
    run_dir = make_run_dir(cfg.out_dir)

    # 1) labels
    y = make_y(samples_sup, cfg.label_col, cfg.label_map)

    # 2) hold-out test
    X_trainval, X_test, y_trainval, y_test = split_holdout_test(expression_sup, y, cfg)
    save_test_ids(run_dir, X_test)

    # 3) 9-fold CV on trainval
    skf = StratifiedKFold(n_splits=cfg.n_folds, shuffle=True, random_state=cfg.seed)

    cv_results = []
    best_hp = None

    for fold, (tr_idx, va_idx) in enumerate(skf.split(X_trainval, y_trainval), start=1):
        fold_dir = os.path.join(run_dir, f"fold_{fold:02d}")
        os.makedirs(fold_dir, exist_ok=True)

        X_tr = X_trainval.iloc[tr_idx]
        X_va = X_trainval.iloc[va_idx]
        y_tr = y_trainval[tr_idx]
        y_va = y_trainval[va_idx]

        save_fold_ids(run_dir, fold, X_tr, X_va)

        # standardize per-fold 
        X_tr_s, X_va_s, _scaler_fold = standardize_train_val(X_tr, X_va)

        # tune once and reuse HP for all folds to save time
        if best_hp is None:
            tuner, best_hp = tune_hyperparameters(
                X_tr_s, y_tr,
                X_va_s, y_va,
                input_dim=X_tr_s.shape[1],
                run_dir=fold_dir,
                cfg=cfg
            )

            with open(os.path.join(run_dir, "best_hyperparameters.json"), "w") as f:
                json.dump(best_hp.values, f, indent=2)

        # train best HP on this fold
        model, ckpt_path = train_best(
            X_tr_s, y_tr,
            X_va_s, y_va,
            input_dim=X_tr_s.shape[1],
            best_hp=best_hp,
            run_dir=fold_dir,
            cfg=cfg
        )

        # evaluate on fold validation
        fold_metrics = evaluate(model, X_va_s, y_va, fold_dir, prefix="val")

        cv_results.append({
            "fold": fold,
            "val_accuracy": fold_metrics["accuracy"],
            "val_macro_f1": fold_metrics["macro_f1"],
            "model_path": ckpt_path
        })

    with open(os.path.join(run_dir, "cv_results.json"), "w") as f:
        json.dump(cv_results, f, indent=2)

    # 4) final training on all trainval with a small internal val split for early stopping
    X_tr_final, X_va_final, y_tr_final, y_va_final = train_test_split(
        X_trainval, y_trainval,
        test_size=0.15,
        random_state=cfg.seed,
        stratify=y_trainval
    )

    scaler_final = StandardScaler(with_mean=True, with_std=True)
    X_tr_final_s = scaler_final.fit_transform(X_tr_final).astype(np.float32)
    X_va_final_s = scaler_final.transform(X_va_final).astype(np.float32)
    X_test_s = scaler_final.transform(X_test).astype(np.float32)

    final_dir = os.path.join(run_dir, "final")
    os.makedirs(final_dir, exist_ok=True)

    model_final, ckpt_path_final = train_best(
        X_tr_final_s, y_tr_final,
        X_va_final_s, y_va_final,
        input_dim=X_tr_final_s.shape[1],
        best_hp=best_hp,
        run_dir=final_dir,
        cfg=cfg
    )

    # 5) evaluate once on held-out test
    test_results = evaluate(model_final, X_test_s, y_test, final_dir, prefix="test")

    # 6) save metadata + final scaler
    save_run_metadata(final_dir, cfg, best_hp, scaler_final)

    # print summary
    val_accs = [r["val_accuracy"] for r in cv_results]
    val_f1s = [r["val_macro_f1"] for r in cv_results]

    print("Saved run to:", run_dir)
    print("Final model:", ckpt_path_final)
    print(f"CV val accuracy: mean={np.mean(val_accs):.4f} std={np.std(val_accs):.4f}")
    print(f"CV val macro F1: mean={np.mean(val_f1s):.4f} std={np.std(val_f1s):.4f}")
    print("Test accuracy:", test_results["accuracy"])
    print("Test macro F1:", test_results["macro_f1"])

    return test_results, run_dir

In [54]:
#Now we call the main function to run the entire pipeline
results, run_dir = main(expression_sup, samples_sup)


Trial 254 Complete [00h 00m 17s]
val_accuracy: 0.807692289352417

Best val_accuracy So Far: 0.8589743375778198
Total elapsed time: 01h 07m 01s
Epoch 1/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.5365 - loss: 1.3546 - val_accuracy: 0.7179 - val_loss: 0.7063
Epoch 2/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.6742 - loss: 0.8854 - val_accuracy: 0.7308 - val_loss: 0.6638
Epoch 3/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7585 - loss: 0.7054 - val_accuracy: 0.7179 - val_loss: 0.6273
Epoch 4/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7909 - loss: 0.6180 - val_accuracy: 0.7179 - val_loss: 0.6107
Epoch 5/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7877 - loss: 0.5945 - val_accuracy: 0.7436 - val_loss: 0.6124
Epoch 6/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━

In [56]:
#we can now access the saved metrics to produce evaluation reports and plots
run_dir = r"C:/Users/joann/Desktop/M2/Deep_Learning/MLP_run/20251229_120944" 
final_dir = os.path.join(run_dir, "final")

  

# Load metrics
with open(os.path.join(final_dir, "test_metrics.json"), "r") as f:
    metrics = json.load(f)

cm = np.array(metrics["confusion_matrix"])
class_names = ["G2", "G3", "G4"]  # must match your label mapping

def plot_confusion_matrix(cm, class_names, out_path):
    fig, ax = plt.subplots()
    im = ax.imshow(cm)

    ax.set_xticks(range(len(class_names)))
    ax.set_yticks(range(len(class_names)))
    ax.set_xticklabels(class_names, rotation=45, ha="right")
    ax.set_yticklabels(class_names)

    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    ax.set_title("Confusion Matrix")

    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, str(cm[i, j]), ha="center", va="center")

    fig.colorbar(im, ax=ax)
    fig.tight_layout()
    fig.savefig(out_path, dpi=200)
    plt.close(fig)

plot_confusion_matrix(cm, class_names, os.path.join(run_dir, "confusion_matrix.png"))
print("Saved:", os.path.join(run_dir, "confusion_matrix.png"))


Saved: C:/Users/joann/Desktop/M2/Deep_Learning/MLP_run/20251229_120944\confusion_matrix.png


In [None]:
#look at the best model 
run_dir = r"C:/Users/joann/Desktop/M2/Deep_Learning/MLP_run/20251229_120944"
model_path = os.path.join(run_dir, "final", "best_model.keras")

model = load_model(model_path)
model.summary()