In [None]:
import pandas as pd

# Step 1: Load expression matrix
expression_df = pd.read_csv("expression", sep="\t", index_col=0)

# Step 2: Transpose so rows = samples, columns = genes expression level
expression_df = expression_df.transpose()

# Step 3: Clean sample IDs to match the format used in the response dataframe, removes tissue type at the end
expression_df.index = expression_df.index.str.replace("-01", "", regex=False)

# Step 4: Load response labels again (from earlier phenotype processing)
#same thing as earlier, cleaning phenotype dataset to merge
phenotype_df = pd.read_csv("phenotype", sep="\t")
response_df = phenotype_df[["sampleID", "primary_therapy_outcome_success"]].copy()
response_df.columns = ["sample", "outcome"]
response_df["sample"] = response_df["sample"].str.replace("-01", "", regex=False)

response_map = {
    "Complete Remission/Response": 1,
    "Partial Remission/Response": 1,
    "Progressive Disease": 0,
    "Stable Disease": 0
}
response_df["response"] = response_df["outcome"].map(response_map)
response_df = response_df.dropna(subset=["response"])

# Step 5: Merge expression matrix with response labels
expression_with_response = expression_df.merge(response_df[["sample", "response"]],
                                                left_index=True, right_on="sample")

# Step 6: Separate features and labels
X_expr = expression_with_response.drop(columns=["sample", "response"]) #feature matrix (gene expression)
y_expr = expression_with_response["response"] #label vector, binary 0 or 1

# Sanity check
print("Expression matrix shape (samples x genes):", X_expr.shape)
print("Label distribution:")
print(y_expr.value_counts())

Expression matrix shape (samples x genes): (483, 20530)
Label distribution:
response
1.0    425
0.0     58
Name: count, dtype: int64


T TEST

In [None]:
from scipy.stats import ttest_ind #2 sample ttest used to compare the means of two groups
#helps us find which genes whose average expression is significantly different b/w these two patient groups

# Step 1: Split expression matrix by response group
responder_expr = X_expr[y_expr == 1] #all rows(patients) where response=1
nonresponder_expr = X_expr[y_expr == 0] #where response=0

# Step 2: Perform t-tests across all genes
#loops through all genes running independent ttests
#compares expression levels in responders vs. nonresponders
#returns p value, stored to dictionary mapped to each gene
p_values = {}
for gene in X_expr.columns:
    t_stat, p_val = ttest_ind(responder_expr[gene], nonresponder_expr[gene], equal_var=False)
    p_values[gene] = p_val

# Step 3: Convert to DataFrame and filter
# index = gene, column = p value, sorted most to least significant
ttest_results = pd.DataFrame.from_dict(p_values, orient='index', columns=["p_value"])
ttest_results.sort_values("p_value", inplace=True)

# Optional: apply p-value cutoff
#cutoff at 0.05
significant_genes_expr = ttest_results[ttest_results["p_value"] < 0.05].index.tolist()

# Step 4: Subset the expression matrix to only significant genes
X_sig_expr = X_expr[significant_genes_expr]

# Summary
print("Total genes tested:", len(X_expr.columns))
print("Significant genes (p < 0.05):", len(significant_genes_expr))
print("Shape of filtered expression matrix:", X_sig_expr.shape)

Total genes tested: 20530
Significant genes (p < 0.05): 5072
Shape of filtered expression matrix: (483, 5072)


MODEL TRAINING ON FILTERED EXPRESSION DATA

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Step 1: Prepare data
X = X_sig_expr.values
y = y_expr.values

# Step 2: Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Step 3: Scale expression values
#standardizes input features, zero mean unit variance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Define the model
#3 layer neural network, layer 1: 128 ReLU units
#layer 2: 64 ReLU units
#output: 1 sigmoid unit(probability of being a responder)
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.4),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

# Step 5: Train the model
#50 epochs, model updates every 16 samples
history = model.fit(X_train_scaled, y_train, epochs=50, batch_size=16, validation_split=0.2, verbose=1)

# Step 6: Evaluate
y_pred_prob = model.predict(X_test_scaled).flatten()
y_pred = (y_pred_prob >= 0.5).astype(int)

auc = roc_auc_score(y_test, y_pred_prob)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"\nTest AUC: {auc:.3f}")
print("\nConfusion Matrix:")
print(cm)
print("\nClassification Report:")
print(report)


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.6669 - auc: 0.5892 - loss: 0.9322 - val_accuracy: 0.8462 - val_auc: 0.5997 - val_loss: 0.9819
Epoch 2/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.8307 - auc: 0.7294 - loss: 0.7291 - val_accuracy: 0.8333 - val_auc: 0.7056 - val_loss: 0.7606
Epoch 3/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9000 - auc: 0.8474 - loss: 0.3769 - val_accuracy: 0.8718 - val_auc: 0.7456 - val_loss: 0.7650
Epoch 4/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9063 - auc: 0.8797 - loss: 0.3181 - val_accuracy: 0.8718 - val_auc: 0.6493 - val_loss: 1.0384
Epoch 5/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9170 - auc: 0.8696 - loss: 0.2905 - val_accuracy: 0.8590 - val_auc: 0.6906 - val_loss: 0.8302
Epoch 6/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━

CUT DOWN SIGNIFICANT GENES FROM 5000 TO MUCH LOWER

In [9]:
top_n = 500  # Or try 1000, 2000
top_genes_by_p = ttest_results.sort_values("p_value").head(top_n).index.tolist()
X_sig_expr_topN = X_expr[top_genes_by_p]
print("Top N significant genes by p-value:", len(X_sig_expr_topN.columns))

Top N significant genes by p-value: 500


RETRAIN ON TOP 500 GENES

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2

# Step 1: Define features and labels
X = X_sig_expr_topN.values
y = y_expr.values

# Step 2: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Step 3: Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Compute class weights
class_weights_array = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(enumerate(class_weights_array))

# Step 5: Define the model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=l2(0.01)),
    Dropout(0.4),
    Dense(32, activation='relu', kernel_regularizer=l2(0.01)),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

# Step 6: Train the model
history = model.fit(
    X_train_scaled, y_train,
    epochs=50, batch_size=16,
    validation_split=0.2,
    class_weight=class_weights,
    verbose=1
)

# Step 7: Evaluate
y_pred_prob = model.predict(X_test_scaled).flatten()
y_pred = (y_pred_prob >= 0.5).astype(int)

auc = roc_auc_score(y_test, y_pred_prob)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"\nTest AUC: {auc:.3f}")
print("\nConfusion Matrix:")
print(cm)
print("\nClassification Report:")
print(report)


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.5813 - auc: 0.6501 - loss: 2.2358 - val_accuracy: 0.7564 - val_auc: 0.8745 - val_loss: 1.9329
Epoch 2/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7225 - auc: 0.8095 - loss: 2.0144 - val_accuracy: 0.7564 - val_auc: 0.8745 - val_loss: 1.8069
Epoch 3/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7424 - auc: 0.8383 - loss: 1.8254 - val_accuracy: 0.8333 - val_auc: 0.8772 - val_loss: 1.6819
Epoch 4/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8180 - auc: 0.9036 - loss: 1.7328 - val_accuracy: 0.8205 - val_auc: 0.8908 - val_loss: 1.6094
Epoch 5/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8160 - auc: 0.8673 - loss: 1.6086 - val_accuracy: 0.8333 - val_auc: 0.9138 - val_loss: 1.5241
Epoch 6/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━

The expression-based model trained on t-test-selected genes performed very well overall. Although it started with a high loss and relatively modest AUC in the first few epochs, the model steadily improved and stabilized by epoch 10, reaching a validation AUC over 0.93 and continuing to hold strong through later epochs. On the test set, it achieved an accuracy of 87% and an AUC of 0.95 — strong indicators that it learned meaningful patterns in gene expression. Notably, the model balanced performance across both classes, with a macro F1 score of 0.74 and macro recall of 0.78, showing it didn’t just favor the majority class. This suggests the 500 top-ranked genes from the t-test were genuinely informative, and that the model was able to leverage them effectively despite the high dimensionality of the input.

ask how deep learning and neural networks works, wht epochs are, ReLU, sigmoid, model updates every 16 samples

literature research, check if any published papers already did what I did

expand onto other cancer types, kidney 3 types of kidney cancer, separate than together

scGPT