In [1]:
import os, random, numpy as np, pandas as pd
from sklearn.metrics import roc_auc_score
import tensorflow as tf

from DL.DNN.DNN import DNN
from DL.FDR.FDR_control import FDR_control

2025-11-03 13:49:37.685466: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-03 13:49:37.690532: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-03 13:49:37.709121: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-11-03 13:49:37.709138: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-11-03 13:49:37.709916: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

In [2]:
# ----------------------------
# Paths (edit to match your files)
# ----------------------------
# Directories the training and test CSVs are read from. Both currently point
# at the same directory; the train/test split is made by the file names below.
TRAIN_DIR = "/ihome/hpark/jis117/knockoff_combat2/"
TEST_DIR  = "/ihome/hpark/jis117/knockoff_combat2/"

# Feature tables (X_*) and label tables (Y_*), joined onto the directories
# above when loaded. The X tables are split in half by column position in
# build_x3d_from_knockoff: first half = original features, second half = their
# knockoffs.
X_TRAIN_FILE = "train_big_equi_knockoffs_70_25_pca.csv"
Y_TRAIN_FILE = "meta_train_70_pca.csv"
X_TEST_FILE  = "test_big_equi_knockoffs_30_25_pca.csv"
Y_TEST_FILE  = "meta_test_30_pca.csv"

# Base output directory; every run writes its artefacts into
# RESULT_BASEDIR/run<k> (created on demand by the experiment loop).
RESULT_BASEDIR = "./dnn_featout_pitt"

# Column of the Y tables that holds the labels. None makes load_labels fall
# back to the first column.
LABEL_COL = None  # set if your Y files have a column name

In [3]:
# ----------------------------
# Helpers: RNG seeding, label loading, knockoff tensor construction
# ----------------------------
def set_global_seed(seed):
    """Seed Python's ``random``, NumPy's global RNG and TensorFlow with ``seed``.

    The three seeders are called in that order, each with the same value.
    """
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)

def load_labels(csv_path, label_col=None):
    """Read a label vector from a CSV file.

    Parameters
    ----------
    csv_path : str or path-like
        CSV file, read with ``pandas.read_csv`` using its defaults.
    label_col : column label, optional
        Column to take the labels from. ``None`` selects the first column.

    Returns
    -------
    numpy.ndarray
        Flat float array with the values of the chosen column.
    """
    frame = pd.read_csv(csv_path)
    column = frame.iloc[:, 0] if label_col is None else frame[label_col]
    as_float = column.to_numpy().astype(float)
    return as_float.ravel()

def build_x3d_from_knockoff(df, keep_cols=None):
    """Stack original features and their knockoffs into an ``(n, k, 2)`` array.

    The first ``p = df.shape[1] // 2`` columns of ``df`` are treated as the
    original features and the following ``p`` columns as their knockoffs,
    paired purely by position (column ``i`` with column ``p + i``).

    Parameters
    ----------
    df : pandas.DataFrame
        Table laid out as ``[originals | knockoffs]``.
    keep_cols : iterable of column labels, optional
        Original-feature labels to keep, in the order given. Labels that are
        not among the first ``p`` columns are silently dropped. ``None`` keeps
        every original feature.

    Returns
    -------
    x3d : numpy.ndarray, shape ``(n_rows, n_kept, 2)``, dtype float
        ``x3d[:, j, 0]`` is the j-th kept original feature and
        ``x3d[:, j, 1]`` is the column ``p`` positions after it.
    used_cols : list
        Labels of the kept original features.
    sel_idx : list of int
        Positions of the kept features within the first ``p`` columns.

    Warns
    -----
    UserWarning
        If ``df`` has an odd number of columns. The trailing column is then
        ignored and the positional pairing may be shifted (e.g. by a stray
        index column), so the result should not be trusted blindly.
    """
    import warnings

    n, m = df.shape
    p = m // 2
    if m % 2:
        warnings.warn(
            f"Expected an even number of columns ([originals | knockoffs]) but got {m}; "
            "the last column is ignored and original/knockoff pairing may be misaligned.",
            stacklevel=2,
        )

    orig_cols = list(df.columns[:p])
    if keep_cols is None:
        sel_idx = list(range(p))
        used_cols = orig_cols
    else:
        pos = {c: i for i, c in enumerate(orig_cols)}
        used_cols = [c for c in keep_cols if c in pos]
        sel_idx = [pos[c] for c in used_cols]

    # Force an integer dtype: np.array([]) would be float64, which is not a
    # proper positional indexer when nothing is selected.
    orig_positions = np.asarray(sel_idx, dtype=int)
    X_orig = df.iloc[:, orig_positions].to_numpy()
    X_knkf = df.iloc[:, orig_positions + p].to_numpy()

    x3d = np.zeros((X_orig.shape[0], len(sel_idx), 2), dtype=float)
    x3d[:, :, 0] = X_orig
    x3d[:, :, 1] = X_knkf
    return x3d, used_cols, sel_idx

def write_original_only_csv_from_knockoff(df, out_path):
    """Write the first half of ``df``'s columns to ``out_path`` as CSV.

    The first ``df.shape[1] // 2`` columns are kept (the original-feature
    half of an ``[originals | knockoffs]`` table); the row index is not
    written.
    """
    n_original = len(df.columns) // 2
    originals_only = df.iloc[:, :n_original]
    originals_only.to_csv(out_path, index=False)

In [4]:
# ----------------------------
# Load data once
# ----------------------------
# Feature tables ([originals | knockoffs]) and label vectors for both splits.
Xtr_df = pd.read_csv(os.path.join(TRAIN_DIR, X_TRAIN_FILE))
ytr = load_labels(os.path.join(TRAIN_DIR, Y_TRAIN_FILE), LABEL_COL)
Xte_df = pd.read_csv(os.path.join(TEST_DIR, X_TEST_FILE))
yte = load_labels(os.path.join(TEST_DIR, Y_TEST_FILE), LABEL_COL)

# Fail fast when features and labels are misaligned (e.g. a header row was
# swallowed or the wrong file was paired) instead of hitting an opaque error
# later during training or AUC computation.
if len(Xtr_df) != len(ytr):
    raise ValueError(
        f"Train features have {len(Xtr_df)} rows but {len(ytr)} labels were loaded"
    )
if len(Xte_df) != len(yte):
    raise ValueError(
        f"Test features have {len(Xte_df)} rows but {len(yte)} labels were loaded"
    )

# Labels of the original (non-knockoff) features: the first half of the columns.
train_orig_cols = list(Xtr_df.columns[:Xtr_df.shape[1] // 2])

In [5]:
# ----------------------------
# Run 10 seeds
# ----------------------------
# Experiment settings, named once so the two training passes cannot drift apart.
N_RUNS = 10          # number of independent runs
BASE_SEED = 1000     # run k (0-based) is seeded with BASE_SEED + k
N_EPOCHS = 20
BATCH_SIZE = 30
FDR_Q = 0.05         # `q` passed to FDR_control.controlFilter
FDR_OFFSET = 1       # `offset` passed to FDR_control.controlFilter
COEFF_SCALE = 0.05   # multiplier in the `coeff` handed to DNN.build_DNN


def _dnn_coeff(p_num, n_samples):
    """Return COEFF_SCALE * sqrt(2 * log(p) / n), with p and n clamped to >= 1."""
    return COEFF_SCALE * np.sqrt(2.0 * np.log(max(p_num, 1)) / max(n_samples, 1))


def _build_compiled_dnn(p_num, n_samples):
    """Build and compile a single-output sigmoid DNN for `p_num` feature pairs."""
    dnn = DNN(num_epochs=N_EPOCHS, batch_size=BATCH_SIZE, output_layer_activation='sigmoid')
    model = dnn.build_DNN(p_num, n_outputs=1, coeff=_dnn_coeff(p_num, n_samples))
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["AUC"])
    return model


aucs = []             # test AUC per run (NaN when a run selected no features)
n_selected_list = []  # number of selected features per run

# The full training tensor does not depend on the seed, so build it once.
x3d_train_full, _, _ = build_x3d_from_knockoff(Xtr_df, keep_cols=train_orig_cols)
n_train_full, p_num_full = x3d_train_full.shape[0], x3d_train_full.shape[1]

for run in range(N_RUNS):
    seed = BASE_SEED + run
    set_global_seed(seed)
    print(f"\n=== Run {run+1}/{N_RUNS} with seed {seed} ===")

    # === Step 1: Train full DNN on the training split (for FDR weights) ===
    model_full = _build_compiled_dnn(p_num_full, n_train_full)

    run_dir = os.path.join(RESULT_BASEDIR, f"run{run+1}")
    os.makedirs(run_dir, exist_ok=True)
    # NOTE(review): the callback presumably dumps per-layer weights into run_dir
    # for controlFilter to read back; confirm in DL.DNN.
    cb = DNN.Job_finish_Callback(run_dir, p_num_full)
    model_full.fit(x3d_train_full, ytr, epochs=N_EPOCHS, batch_size=BATCH_SIZE, verbose=0, callbacks=[cb])

    # === Step 2: FDR feature selection ===
    tmp_orig_csv = os.path.join(run_dir, "pitt_orig.csv")
    write_original_only_csv_from_knockoff(Xtr_df, tmp_orig_csv)
    fdr = FDR_control()
    selected = fdr.controlFilter(tmp_orig_csv, run_dir, offset=FDR_OFFSET, q=FDR_Q)
    # controlFilter yields (feature, statistic) pairs; only the names are needed.
    selected_features = [feature for feature, _stat in selected]

    n_selected = len(selected_features)
    n_selected_list.append(n_selected)
    print(f"Run {run+1}: {n_selected} features selected")

    selected_csv_path = os.path.join(run_dir, f"selected_features_run{run+1}.csv")
    df_sel = pd.DataFrame({"feature": selected_features})
    df_sel.to_csv(selected_csv_path, index=False)
    print(f"Saved selected features to: {selected_csv_path}")

    if n_selected == 0:
        # Nothing to retrain on; record NaN so run indices stay aligned.
        print("No features selected; skipping this run")
        aucs.append(np.nan)
        continue

    # === Step 3: Restrict data to selected features ===
    x3d_train_sel, train_sel_cols, _ = build_x3d_from_knockoff(Xtr_df, keep_cols=selected_features)
    x3d_test_sel, test_sel_cols, _ = build_x3d_from_knockoff(Xte_df, keep_cols=selected_features)
    # build_x3d_from_knockoff silently drops unknown columns, so make a
    # train/test mismatch explicit instead of failing inside predict().
    if test_sel_cols != train_sel_cols:
        missing = [c for c in train_sel_cols if c not in test_sel_cols]
        raise ValueError(
            f"Run {run+1}: selected features missing from the test table: {missing}"
        )

    # === Step 4: Retrain DNN on selected features ===
    p_num_sel = x3d_train_sel.shape[1]
    model_sel = _build_compiled_dnn(p_num_sel, x3d_train_sel.shape[0])
    model_sel.fit(x3d_train_sel, ytr, epochs=N_EPOCHS, batch_size=BATCH_SIZE, verbose=0)

    # === Step 5: AUC on the held-out test table ===
    yprob = model_sel.predict(x3d_test_sel).reshape(-1)
    auc = roc_auc_score(yte, yprob)
    aucs.append(auc)
    print(f"Run {run+1}: AUC = {auc:.4f}")


=== Run 1/10 with seed 1000 ===
__init__parameters
[layer]: Input	[shape]: [None, 104, 2] 

[layer]: LocallyConnected1D	[shape]: [None, 104, 1] 

[layer]: LocallyConnected1D	[shape]: [None, 104, 1] 

[layer]: Flatten	[shape]: [None, 104] 

[layer]: Dense	[shape]: [None, 104] 

[layer]: Dense	[shape]: [None, 104] 

[layer]: Dense	[shape]: [None, 1] 

on_epoch_end
h_local1_weight = (104, 2, 1)
h_local2_weight = (104, 1, 1)
h0 = (104, 2)
h0_abs = (104, 2)
h1 = (104, 104)
h2 = (104, 104)
h3 = (104, 1)
W1 = (104, 104)
W2 = (104, 104)
W3 = (104, 1)
on_epoch_end
h_local1_weight = (104, 2, 1)
h_local2_weight = (104, 1, 1)
h0 = (104, 2)
h0_abs = (104, 2)
h1 = (104, 104)
h2 = (104, 104)
h3 = (104, 1)
W1 = (104, 104)
W2 = (104, 104)
W3 = (104, 1)
on_epoch_end
h_local1_weight = (104, 2, 1)
h_local2_weight = (104, 1, 1)
h0 = (104, 2)
h0_abs = (104, 2)
h1 = (104, 104)
h2 = (104, 104)
h3 = (104, 1)
W1 = (104, 104)
W2 = (104, 104)
W3 = (104, 1)
on_epoch_end
h_local1_weight = (104, 2, 1)
h_local2_weig

__init__
208
Run 2: 45 features selected
Saved selected features to: ./dnn_featout_pitt/run2/selected_features_run2.csv
__init__parameters
[layer]: Input	[shape]: [None, 45, 2] 

[layer]: LocallyConnected1D	[shape]: [None, 45, 1] 

[layer]: LocallyConnected1D	[shape]: [None, 45, 1] 

[layer]: Flatten	[shape]: [None, 45] 

[layer]: Dense	[shape]: [None, 45] 

[layer]: Dense	[shape]: [None, 45] 

[layer]: Dense	[shape]: [None, 1] 

Run 2: AUC = 0.6612

=== Run 3/10 with seed 1002 ===
__init__parameters
[layer]: Input	[shape]: [None, 104, 2] 

[layer]: LocallyConnected1D	[shape]: [None, 104, 1] 

[layer]: LocallyConnected1D	[shape]: [None, 104, 1] 

[layer]: Flatten	[shape]: [None, 104] 

[layer]: Dense	[shape]: [None, 104] 

[layer]: Dense	[shape]: [None, 104] 

[layer]: Dense	[shape]: [None, 1] 

on_epoch_end
h_local1_weight = (104, 2, 1)
h_local2_weight = (104, 1, 1)
h0 = (104, 2)
h0_abs = (104, 2)
h1 = (104, 104)
h2 = (104, 104)
h3 = (104, 1)
W1 = (104, 104)
W2 = (104, 104)
W3 = (104,

__init__
208
Run 4: 49 features selected
Saved selected features to: ./dnn_featout_pitt/run4/selected_features_run4.csv
__init__parameters
[layer]: Input	[shape]: [None, 49, 2] 

[layer]: LocallyConnected1D	[shape]: [None, 49, 1] 

[layer]: LocallyConnected1D	[shape]: [None, 49, 1] 

[layer]: Flatten	[shape]: [None, 49] 

[layer]: Dense	[shape]: [None, 49] 

[layer]: Dense	[shape]: [None, 49] 

[layer]: Dense	[shape]: [None, 1] 

Run 4: AUC = 0.7125

=== Run 5/10 with seed 1004 ===
__init__parameters
[layer]: Input	[shape]: [None, 104, 2] 

[layer]: LocallyConnected1D	[shape]: [None, 104, 1] 

[layer]: LocallyConnected1D	[shape]: [None, 104, 1] 

[layer]: Flatten	[shape]: [None, 104] 

[layer]: Dense	[shape]: [None, 104] 

[layer]: Dense	[shape]: [None, 104] 

[layer]: Dense	[shape]: [None, 1] 

on_epoch_end
h_local1_weight = (104, 2, 1)
h_local2_weight = (104, 1, 1)
h0 = (104, 2)
h0_abs = (104, 2)
h1 = (104, 104)
h2 = (104, 104)
h3 = (104, 1)
W1 = (104, 104)
W2 = (104, 104)
W3 = (104,

__init__
208
Run 6: 52 features selected
Saved selected features to: ./dnn_featout_pitt/run6/selected_features_run6.csv
__init__parameters
[layer]: Input	[shape]: [None, 52, 2] 

[layer]: LocallyConnected1D	[shape]: [None, 52, 1] 

[layer]: LocallyConnected1D	[shape]: [None, 52, 1] 

[layer]: Flatten	[shape]: [None, 52] 

[layer]: Dense	[shape]: [None, 52] 

[layer]: Dense	[shape]: [None, 52] 

[layer]: Dense	[shape]: [None, 1] 

Run 6: AUC = 0.6227

=== Run 7/10 with seed 1006 ===
__init__parameters
[layer]: Input	[shape]: [None, 104, 2] 

[layer]: LocallyConnected1D	[shape]: [None, 104, 1] 

[layer]: LocallyConnected1D	[shape]: [None, 104, 1] 

[layer]: Flatten	[shape]: [None, 104] 

[layer]: Dense	[shape]: [None, 104] 

[layer]: Dense	[shape]: [None, 104] 

[layer]: Dense	[shape]: [None, 1] 

on_epoch_end
h_local1_weight = (104, 2, 1)
h_local2_weight = (104, 1, 1)
h0 = (104, 2)
h0_abs = (104, 2)
h1 = (104, 104)
h2 = (104, 104)
h3 = (104, 1)
W1 = (104, 104)
W2 = (104, 104)
W3 = (104,

__init__
208
Run 8: 51 features selected
Saved selected features to: ./dnn_featout_pitt/run8/selected_features_run8.csv
__init__parameters
[layer]: Input	[shape]: [None, 51, 2] 

[layer]: LocallyConnected1D	[shape]: [None, 51, 1] 

[layer]: LocallyConnected1D	[shape]: [None, 51, 1] 

[layer]: Flatten	[shape]: [None, 51] 

[layer]: Dense	[shape]: [None, 51] 

[layer]: Dense	[shape]: [None, 51] 

[layer]: Dense	[shape]: [None, 1] 

Run 8: AUC = 0.6447

=== Run 9/10 with seed 1008 ===
__init__parameters
[layer]: Input	[shape]: [None, 104, 2] 

[layer]: LocallyConnected1D	[shape]: [None, 104, 1] 

[layer]: LocallyConnected1D	[shape]: [None, 104, 1] 

[layer]: Flatten	[shape]: [None, 104] 

[layer]: Dense	[shape]: [None, 104] 

[layer]: Dense	[shape]: [None, 104] 

[layer]: Dense	[shape]: [None, 1] 

on_epoch_end
h_local1_weight = (104, 2, 1)
h_local2_weight = (104, 1, 1)
h0 = (104, 2)
h0_abs = (104, 2)
h1 = (104, 104)
h2 = (104, 104)
h3 = (104, 1)
W1 = (104, 104)
W2 = (104, 104)
W3 = (104,

__init__
208
Run 10: 57 features selected
Saved selected features to: ./dnn_featout_pitt/run10/selected_features_run10.csv
__init__parameters
[layer]: Input	[shape]: [None, 57, 2] 

[layer]: LocallyConnected1D	[shape]: [None, 57, 1] 

[layer]: LocallyConnected1D	[shape]: [None, 57, 1] 

[layer]: Flatten	[shape]: [None, 57] 

[layer]: Dense	[shape]: [None, 57] 

[layer]: Dense	[shape]: [None, 57] 

[layer]: Dense	[shape]: [None, 1] 

Run 10: AUC = 0.6575


In [6]:
# ----------------------------
# Results summary
# ----------------------------
print("\nAll AUCs:", aucs)
print("Selected counts:", n_selected_list)

# Runs that selected no features were recorded as NaN; keep them out of the stats.
valid_aucs = [score for score in aucs if not np.isnan(score)]
if not valid_aucs:
    print("No valid AUCs (all runs selected 0 features)")
else:
    auc_mean = np.mean(valid_aucs)
    auc_sd = np.std(valid_aucs)  # population standard deviation (NumPy default)
    nonzero_counts = [count for count in n_selected_list if count > 0]
    print(f"Mean AUC over {len(valid_aucs)} runs: {auc_mean:.4f} ± {auc_sd:.4f}")
    print(f"Average number of features selected: {np.mean(nonzero_counts):.1f}")


All AUCs: [0.7252747252747253, 0.6611721611721612, 0.6813186813186812, 0.7124542124542125, 0.7197802197802198, 0.6227106227106227, 0.7307692307692307, 0.6446886446886446, 0.6318681318681318, 0.6575091575091575]
Selected counts: [54, 45, 58, 49, 50, 52, 48, 51, 54, 57]
Mean AUC over 10 runs: 0.6788 ± 0.0387
Average number of features selected: 51.8
