In [None]:
import pandas as pd
import numpy as np
from neuroCombat import neuroCombat
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# ================================================================
# 1. LOAD DATA (gene_expr = samples × genes)
# ================================================================
print("Loading data...")

# NOTE(review): `gene_expr` and `metadata` are not created in this cell; they
# are presumably loaded earlier in the notebook — confirm before running.
print("Gene expression shape (samples × genes):", gene_expr.shape)
print("Metadata shape:", metadata.shape)

# Validate with explicit exceptions rather than `assert`: assertions are
# stripped when Python runs with -O, which would let mismatched inputs through.
if len(gene_expr) != len(metadata):
    raise ValueError("Metadata and expression row count differ!")

# Equal row counts do not guarantee the same samples, so fail with a readable
# message if any metadata sample ID is absent from the expression matrix.
missing_samples = metadata.index.difference(gene_expr.index)
if len(missing_samples) > 0:
    raise ValueError(
        f"{len(missing_samples)} metadata sample(s) missing from gene_expr, "
        f"e.g. {list(missing_samples[:5])}"
    )

# Reorder expression rows to follow the metadata order so that row i of the
# matrix, y[i] and batch[i] all describe the same sample.
gene_expr = gene_expr.loc[metadata.index]

# Extract labels
y = metadata["label"].values      # 0/1 tumor vs healthy
batch = metadata["batch"].values  # TCGA / GTEx

# ================================================================
# 2. COMBAT BATCH CORRECTION (no biological protection)
# ================================================================
print("\nRunning ComBat (NO label protection)...")

# NOTE(review): ComBat is fitted on every sample here, before the train/test
# split further down, so held-out samples influence the correction — confirm
# this is acceptable for the reported test metrics.

# The batch label is the only covariate handed to ComBat; the tumor/healthy
# label is deliberately left out, so it is not protected.
covars = pd.DataFrame({"batch": batch})

# ComBat is fed genes × samples, i.e. the transpose of the sklearn layout.
combat_input = gene_expr.T

combat_result = neuroCombat(
    dat=combat_input.values,
    covars=covars,
    batch_col="batch",
    parametric=True,
)
combat_output = combat_result["data"]

# Transpose back to samples × genes for scikit-learn.
X_corrected = combat_output.T

print("Post-ComBat expression shape (samples × genes):", X_corrected.shape)

# ================================================================
# 3. TRAIN/TEST SPLIT
# ================================================================
print("\nSplitting train/test...")

# Hold out 20% of the samples, shuffled with a fixed seed and stratified on
# the label so both partitions keep similar class proportions.
split_parts = train_test_split(
    X_corrected,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

print("Train:", X_train.shape, " Test:", X_test.shape)

# ================================================================
# 4. LEAK-FREE CV (feature selection + scaling inside fold)
# ================================================================
print("\nRunning leak-free cross-validation...")

# NOTE(review): "leak-free" covers the feature selection and scaling below,
# which are fitted on the training part of each fold only. X_train itself was
# produced upstream — confirm any earlier preprocessing is acceptable too.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Keep the 200 top-scoring genes, but never request more features than the
# matrix has, so small gene panels do not trip SelectKBest's k > n_features
# check.
k_cv = min(200, X_train.shape[1])

# Validation accuracy of each fold, in fold order.
fold_scores = []

for tr_idx, val_idx in cv.split(X_train, y_train):

    X_tr, X_val = X_train[tr_idx], X_train[val_idx]
    y_tr, y_val = y_train[tr_idx], y_train[val_idx]

    # Feature selection: ANOVA F-test ranking fitted on the training part
    # only, then applied unchanged to the validation part.
    selector = SelectKBest(f_classif, k=k_cv)
    X_tr_sel = selector.fit_transform(X_tr, y_tr)
    X_val_sel = selector.transform(X_val)

    # Scaling: statistics come from the training part only.
    scaler = StandardScaler()
    X_tr_scal = scaler.fit_transform(X_tr_sel)
    X_val_scal = scaler.transform(X_val_sel)

    # Model: fresh forest per fold; fixed seed keeps runs reproducible.
    model = RandomForestClassifier(
        n_estimators=200,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1,
    )
    model.fit(X_tr_scal, y_tr)

    preds = model.predict(X_val_scal)
    fold_scores.append(accuracy_score(y_val, preds))

print("\nCV accuracy per fold:", fold_scores)
print("Mean CV accuracy:", np.mean(fold_scores))

# ================================================================
# 5. FINAL MODEL ON FULL TRAIN SET
# ================================================================
print("\nTraining final model...")

# Keep the 200 top-scoring genes, but never request more features than the
# matrix has, so small gene panels do not trip SelectKBest's k > n_features
# check.
k_final = min(200, X_train.shape[1])

# Feature selection is fitted on the full training set and then applied
# unchanged to the held-out test set.
selector_final = SelectKBest(f_classif, k=k_final)
X_train_sel = selector_final.fit_transform(X_train, y_train)
X_test_sel = selector_final.transform(X_test)

# Scaling statistics likewise come from the training set only.
scaler_final = StandardScaler()
X_train_scal = scaler_final.fit_transform(X_train_sel)
X_test_scal = scaler_final.transform(X_test_sel)

# Same hyper-parameters as the per-fold CV models.
final_model = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)

final_model.fit(X_train_scal, y_train)

# ================================================================
# 6. TEST SET EVALUATION
# ================================================================
# Score the held-out test set with the final model.
# Column 1 of predict_proba (the second entry of final_model.classes_, i.e.
# label 1 when labels are 0/1) serves as the ranking score for ROC-AUC.
y_proba = final_model.predict_proba(X_test_scal)[:, 1]
y_pred = final_model.predict(X_test_scal)

auc = roc_auc_score(y_test, y_proba)
acc = accuracy_score(y_test, y_pred)

print("\nFINAL TEST PERFORMANCE")
print("=======================")
print("Accuracy:", acc)
print("ROC-AUC:", auc)
