In [8]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import pearsonr
warnings.filterwarnings('ignore')

print("Libraries loaded successfully!")

Libraries loaded successfully!


## 1. Load Data

In [None]:
# Directory holding gene_expression.csv and metadata.csv.
# The original author's location stays the default so existing runs behave the
# same; set the BRAIN_TUMOR_DATA_DIR environment variable to point the notebook
# at another copy of the data without editing this cell.
data_path = os.environ.get(
    "BRAIN_TUMOR_DATA_DIR",
    r"c:\Users\ozeir\vsrepo\ai-brain-tumor\data\raw",
)

# Expression table: the first CSV column is used as the row index.
gene_expr = pd.read_csv(os.path.join(data_path, "gene_expression.csv"), index_col=0)

# Load metadata, then set sample_id as index
metadata = pd.read_csv(os.path.join(data_path, "metadata.csv"))
metadata = metadata.set_index("sample_id")

print("Gene expression shape (samples × genes):", gene_expr.shape)
print("Metadata shape:", metadata.shape)

# Align samples between gene_expr and metadata
common_ids = gene_expr.index.intersection(metadata.index)
print(f"\nCommon samples: {len(common_ids)}")

# Fail here with a clear message rather than carrying two empty tables into
# the downstream cells, where the resulting error would be harder to trace.
if len(common_ids) == 0:
    raise ValueError(
        "gene_expression.csv and metadata.csv share no sample IDs; "
        "check that both files use the same identifier format."
    )

# Keep only the shared samples and sort both tables the same way so that row i
# of gene_expr and row i of metadata refer to the same sample.
gene_expr = gene_expr.loc[common_ids].sort_index()
metadata  = metadata.loc[common_ids].sort_index()

print("After alignment:")
print("  gene_expr:", gene_expr.shape)
print("  metadata :", metadata.shape)

print("\nClass distribution:")
print(metadata["label"].value_counts())

print("\nBatch distribution:")
print(metadata["batch"].value_counts())


Gene expression shape (samples × genes): (18635, 18858)
Metadata shape: (18635, 2)

Common samples: 18635
After alignment:
  gene_expr: (18635, 18858)
  metadata : (18635, 2)

Class distribution:
label
1    17382
0     1253
Name: count, dtype: int64

Batch distribution:
batch
healthy_batch    17382
tumor_batch       1253
Name: count, dtype: int64
After alignment:
  gene_expr: (18635, 18858)
  metadata : (18635, 2)

Class distribution:
label
1    17382
0     1253
Name: count, dtype: int64

Batch distribution:
batch
healthy_batch    17382
tumor_batch       1253
Name: count, dtype: int64


In [22]:
# Show the first ten row labels of each table so their ID format and ordering
# can be compared by eye.
for heading, table in (
    ("gene_expr.index:", gene_expr),
    ("\nmetadata.index:", metadata),
):
    print(heading)
    print(table.index[:10])


gene_expr.index:
Index(['000f90b3-7383-4887-af84-73f231c03f39.rna_seq.augmented_star_gene_counts',
       '003f7220-d3c0-4b15-8cfb-38f20668c80b.rna_seq.augmented_star_gene_counts',
       '0045dd59-166c-426d-ac49-501bbb129d43.rna_seq.augmented_star_gene_counts',
       '00564877-4cd9-4637-8dea-7b0a17b37857.rna_seq.augmented_star_gene_counts',
       '005b79ce-22fd-4575-9271-a41bfebc94d3.rna_seq.augmented_star_gene_counts',
       '006b091b-3a7a-4a26-9eb6-d7797874ad9b.rna_seq.augmented_star_gene_counts',
       '008e5b39-aa42-45e9-98bf-6a180cde10bc.rna_seq.augmented_star_gene_counts',
       '00f3dc32-cdd0-4ded-8ff8-f7809cfe3883.rna_seq.augmented_star_gene_counts',
       '013c0ba5-0d5e-467a-b9cc-4e19f75e21d6.rna_seq.augmented_star_gene_counts',
       '014c0152-04c3-4370-9fc6-e42bdc7f3f79.rna_seq.augmented_star_gene_counts'],
      dtype='object', name='sample_id')

metadata.index:
Index(['000f90b3-7383-4887-af84-73f231c03f39.rna_seq.augmented_star_gene_counts',
       '003f7220-d3c0-4

## 2. Batch Correction (ComBat), Model Training and Evaluation

In [None]:
import pandas as pd
import numpy as np
from neuroCombat import neuroCombat
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# 1. LOAD DATA (gene_expr = samples × genes)
# gene_expr and metadata come from the loading cell earlier in the notebook;
# this step validates them and derives the label / batch vectors.
print("Loading data...")

print("Gene expression shape (samples × genes):", gene_expr.shape)
print("Metadata shape:", metadata.shape)

# Ensure indices match
assert len(gene_expr) == len(metadata), "Metadata and expression row count differ!"
# Duplicate sample IDs would make the label-based reindex below return extra
# rows, leaving the expression matrix out of step with the label vector.
assert gene_expr.index.is_unique, "Duplicate sample IDs in gene_expr!"
assert metadata.index.is_unique, "Duplicate sample IDs in metadata!"

# Align indices if needed (raises KeyError if a metadata ID is missing)
gene_expr = gene_expr.loc[metadata.index]

# Extract labels
# NOTE(review): which of 0/1 means tumor is not defined in this notebook; the
# counts printed by the loading cell pair label 1 with 'healthy_batch' — confirm.
y = metadata["label"].values      # binary class label, one entry per sample
batch = metadata["batch"].values  # batch name per sample, same row order as y

# Confounding check: if every batch holds samples of one class only, batch and
# class cannot be told apart, so removing "batch effects" and then classifying
# is not interpretable. print() is used because warnings are globally silenced
# in the first cell of this notebook.
batch_by_label = pd.crosstab(metadata["batch"], metadata["label"])
classes_per_batch = (batch_by_label > 0).sum(axis=1)
if (classes_per_batch <= 1).all():
    print(
        "WARNING: every batch contains a single class — batch and label are "
        "perfectly confounded; downstream accuracy may reflect batch "
        "differences rather than biology."
    )
    print(batch_by_label)

# 2. COMBAT BATCH CORRECTION (no biological protection)
# NOTE(review): the correction is fitted on every sample here, before the
# train/test split further down, so held-out samples contribute to the fit.
# NOTE(review): batch is the only covariate passed in; the class label is not
# protected. The class and batch counts printed by the loading cell are
# identical, which suggests the two are confounded — confirm before trusting
# downstream scores.
print("\nRunning ComBat (NO label protection)...")

# neuroCombat receives the matrix transposed relative to gene_expr.
combat_input = gene_expr.transpose()      # (genes × samples)

# Single-column covariate table: one batch name per sample.
covars = pd.DataFrame({"batch": batch})

combat_result = neuroCombat(
    dat=combat_input.values,
    covars=covars,
    batch_col='batch',
    parametric=True,
)
# Only the "data" entry of the returned mapping is used from here on.
combat_output = combat_result["data"]

# Back to sklearn format: samples × genes
X_corrected = np.transpose(combat_output)

print("Post-ComBat expression shape (samples × genes):", X_corrected.shape)

# 3. TRAIN/TEST SPLIT
print("\nSplitting train/test...")

# Shuffled 80/20 hold-out, stratified on y so both parts keep the same class
# proportions; the fixed seed makes the partition repeatable.
split_parts = train_test_split(
    X_corrected, y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

print("Train:", X_train.shape, " Test:", X_test.shape)

# 4. CV (feature selection + scaling inside fold)
# Feature selection and scaling are refitted inside every fold, so those two
# steps never see the fold's validation rows.
# NOTE(review): X_train was already ComBat-corrected together with all other
# samples before the split, so this loop is not free of leakage overall.
print("\nRunning cross-validation...")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold_scores = []      # accuracy per fold
fold_auc_scores = []  # ROC-AUC per fold

for tr_idx, val_idx in cv.split(X_train, y_train):

    X_tr, X_val = X_train[tr_idx], X_train[val_idx]
    y_tr, y_val = y_train[tr_idx], y_train[val_idx]

    # Feature selection (training only)
    selector = SelectKBest(f_classif, k=200)
    X_tr_sel = selector.fit_transform(X_tr, y_tr)
    X_val_sel = selector.transform(X_val)

    # Scaling (training only)
    scaler = StandardScaler()
    X_tr_scal = scaler.fit_transform(X_tr_sel)
    X_val_scal = scaler.transform(X_val_sel)

    # Model
    model = RandomForestClassifier(
        n_estimators=200,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_tr_scal, y_tr)

    preds = model.predict(X_val_scal)
    fold_scores.append(accuracy_score(y_val, preds))

    # Accuracy alone is weak evidence when one class dominates (always
    # predicting the majority class would already score highly), so a
    # threshold-free ROC-AUC is recorded too. Column 1 is the probability of
    # the second class, matching the final test-set evaluation.
    val_proba = model.predict_proba(X_val_scal)[:, 1]
    fold_auc_scores.append(roc_auc_score(y_val, val_proba))

print("\nCV accuracy per fold:", fold_scores)
print("Mean CV accuracy:", np.mean(fold_scores))
print("CV ROC-AUC per fold:", fold_auc_scores)
print("Mean CV ROC-AUC:", np.mean(fold_auc_scores))

# 5. FINAL MODEL ON FULL TRAIN SET
print("\nTraining final model...")

# Rank features with the univariate F-test on the training split only; the
# test split is merely reduced to the same 200 selected columns.
selector_final = SelectKBest(f_classif, k=200).fit(X_train, y_train)
X_train_sel = selector_final.transform(X_train)
X_test_sel  = selector_final.transform(X_test)

# Standardisation statistics also come from the training split alone.
scaler_final = StandardScaler().fit(X_train_sel)
X_train_scal = scaler_final.transform(X_train_sel)
X_test_scal  = scaler_final.transform(X_test_sel)

# Same forest configuration as used inside the cross-validation loop.
rf_settings = dict(
    n_estimators=200,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)
final_model = RandomForestClassifier(**rf_settings)

final_model.fit(X_train_scal, y_train)

# 6. TEST SET EVALUATION
# Hard class predictions feed accuracy; the probability in column 1 of
# predict_proba (the second class) feeds ROC-AUC.
y_pred = final_model.predict(X_test_scal)
test_proba = final_model.predict_proba(X_test_scal)
y_proba = test_proba[:, 1]

acc, auc = accuracy_score(y_test, y_pred), roc_auc_score(y_test, y_proba)

print("\nFINAL TEST PERFORMANCE")
print("Accuracy:", acc)
print("ROC-AUC:", auc)


Loading data...
Gene expression shape (samples × genes): (18635, 18858)
Metadata shape: (18635, 2)

Running ComBat (NO label protection)...
[neuroCombat] Creating design matrix
[neuroCombat] Standardizing data across features

Running ComBat (NO label protection)...
[neuroCombat] Creating design matrix
[neuroCombat] Standardizing data across features
[neuroCombat] Fitting L/S model and finding priors
[neuroCombat] Fitting L/S model and finding priors
[neuroCombat] Finding parametric adjustments
[neuroCombat] Finding parametric adjustments
[neuroCombat] Final adjustment of data
[neuroCombat] Final adjustment of data
Post-ComBat expression shape (samples × genes): (18635, 18858)

Splitting train/test...
Post-ComBat expression shape (samples × genes): (18635, 18858)

Splitting train/test...
Train: (14908, 18858)  Test: (3727, 18858)

Running leak-free cross-validation...
Train: (14908, 18858)  Test: (3727, 18858)

Running leak-free cross-validation...

CV accuracy per fold: [1.0, 1.0, 1.0