In [3]:
import pandas as pd

# Try loading the file with tab separator, and let pandas infer the header
mutation_raw = pd.read_csv("mutations", sep="\t", engine="python")

# Preview the columns and a few rows to verify successful parsing
print("Columns in mutation file:")
print(mutation_raw.columns)

print("\nPreview of mutation data:")
print(mutation_raw.head())

Columns in mutation file:
Index(['sample', 'chr', 'start', 'end', 'reference', 'alt', 'gene', 'effect',
       'Amino_Acid_Change', 'DNA_VAF', 'SIFT', 'PolyPhen'],
      dtype='object')

Preview of mutation data:
            sample chr      start        end reference alt      gene  \
0  TCGA-2A-A8VL-01  10   29775061   29775061         C   T      SVIL   
1  TCGA-2A-A8VL-01  11  105948438  105948438         A   C  AASDHPPT   
2  TCGA-2A-A8VL-01  11   18460141   18460141         C   T      LDHC   
3  TCGA-2A-A8VL-01  11   55606971   55606971         T   A    OR5D16   
4  TCGA-2A-A8VL-01  11    6541310    6541310         C   T     DNHD1   

                   effect Amino_Acid_Change  DNA_VAF  \
0       Missense_Mutation          p.D1578N     0.15   
1  Translation_Start_Site             p.M1?     0.16   
2       Missense_Mutation           p.T220M     0.28   
3                  Silent           p.T248T     0.15   
4       Missense_Mutation           p.S588F     0.26   

                 

In [4]:
# Step 1: Subset the relevant columns
mutation_subset = mutation_raw[["sample", "gene"]].copy()

# Step 2: Drop any rows with missing gene/sample values
mutation_subset.dropna(inplace=True)

# Step 3: Drop duplicate mutations (e.g., same gene mutated multiple times in one sample)
mutation_subset = mutation_subset.drop_duplicates()

# Step 4: Add a column to indicate a mutation is present
mutation_subset["mutated"] = 1

# Step 5: Pivot to create binary matrix: rows = samples, columns = genes
mutation_matrix = mutation_subset.pivot_table(
    index="sample",
    columns="gene",
    values="mutated",
    fill_value=0
)

# Step 6: Check results
print("Binary mutation matrix shape:", mutation_matrix.shape)
print(mutation_matrix.iloc[:5, :5])

Binary mutation matrix shape: (495, 12677)
gene             A1BG  A1CF  A2M  A2ML1  A4GALT
sample                                         
TCGA-2A-A8VL-01   0.0   0.0  0.0    0.0     0.0
TCGA-2A-A8VO-01   0.0   0.0  0.0    0.0     0.0
TCGA-2A-A8VT-01   0.0   0.0  0.0    0.0     0.0
TCGA-2A-A8VV-01   0.0   0.0  0.0    0.0     0.0
TCGA-2A-A8VX-01   0.0   0.0  0.0    0.0     0.0


In [7]:
# Load phenotype data
phenotype_df = pd.read_csv("phenotype", sep="\t")

# Preview all column names to help locate the response label
print("Phenotype columns:\n")
print(phenotype_df.columns.tolist())

# Optional: preview first few rows to see values
print("\nPhenotype preview:\n")
print(phenotype_df.head())


Phenotype columns:

['sampleID', '_INTEGRATION', '_PATIENT', '_cohort', '_primary_disease', '_primary_site', 'additional_pharmaceutical_therapy', 'additional_radiation_therapy', 'age_at_initial_pathologic_diagnosis', 'bcr_followup_barcode', 'bcr_patient_barcode', 'bcr_sample_barcode', 'biochemical_recurrence', 'bone_scan_results', 'clinical_M', 'clinical_T', 'days_to_birth', 'days_to_bone_scan_performed', 'days_to_collection', 'days_to_death', 'days_to_diagnostic_computed_tomography_performed', 'days_to_diagnostic_mri_performed', 'days_to_first_biochemical_recurrence', 'days_to_initial_pathologic_diagnosis', 'days_to_last_followup', 'days_to_new_tumor_event_after_initial_treatment', 'days_to_psa', 'days_to_second_biochemical_recurrence', 'days_to_third_biochemical_recurrence', 'diagnostic_ct_abd_pelvis_performed', 'diagnostic_ct_abd_pelvis_result', 'diagnostic_mri_performed', 'diagnostic_mri_result', 'followup_case_report_form_submission_reason', 'followup_treatment_success', 'form_com

In [8]:
# Check unique values of the suspected response column
response_col = "primary_therapy_outcome_success"

# Preview value counts
print(phenotype_df[response_col].value_counts(dropna=False))

primary_therapy_outcome_success
Complete Remission/Response    396
NaN                             68
Partial Remission/Response      41
Progressive Disease             33
Stable Disease                  27
[Discrepancy]                    1
Name: count, dtype: int64


define responder = 1 as complete remission/response, partial remission/response
non responder = 0 as progressive disease, stable disease
dropNaN and discrepancy

In [12]:
# Step 1: Copy only the relevant columns
response_df = phenotype_df[["sampleID", "primary_therapy_outcome_success"]].copy()
response_df.columns = ["sample", "outcome"]

# Step 2: Clean sample IDs to match mutation matrix format
# Your mutation matrix uses IDs like "TCGA-2A-A8VL"
response_df["sample"] = response_df["sample"].str.replace("-01", "", regex=False)
mutation_matrix.index = mutation_matrix.index.str.replace("-01", "", regex=False)

# Step 3: Map outcome values to binary labels
response_map = {
    "Complete Remission/Response": 1,
    "Partial Remission/Response": 1,
    "Progressive Disease": 0,
    "Stable Disease": 0
}
response_df["response"] = response_df["outcome"].map(response_map)

# Step 4: Drop samples with no valid response
response_df = response_df.dropna(subset=["response"])

# Step 5: Merge binary mutation matrix with labels
mutation_with_response = mutation_matrix.merge(response_df[["sample", "response"]], 
                                                left_index=True, right_on="sample")

# Final sanity check
print("Final dataset shape (samples x genes+1):", mutation_with_response.shape)
print("Label counts:")
print(mutation_with_response["response"].value_counts())


Final dataset shape (samples x genes+1): (432, 12679)
Label counts:
response
1.0    375
0.0     57
Name: count, dtype: int64


RUN FISHERS TEST FOR ALL GENES

In [14]:
from scipy.stats import fisher_exact
# Step 1: separate response from feature matrix
response = mutation_with_response["response"]
X_mut = mutation_with_response.drop(columns=["sample", "response"])

# Step 2: run Fisher's test per gene
p_values = {}
for gene in X_mut.columns:
    gene_values = X_mut[gene]

    # Build contingency table
    #            Responder  Non-responder
    # Mutated         a            b
    # Not Mutated     c            d
    a = ((gene_values == 1) & (response == 1)).sum()
    b = ((gene_values == 1) & (response == 0)).sum()
    c = ((gene_values == 0) & (response == 1)).sum()
    d = ((gene_values == 0) & (response == 0)).sum()
    
    table = [[a, b], [c, d]]
    
    # Fisher’s exact test (right-tailed test by default)
    _, p = fisher_exact(table)
    p_values[gene] = p

# Step 3: convert to DataFrame and filter
fisher_results = pd.DataFrame.from_dict(p_values, orient='index', columns=["p_value"])
fisher_results.sort_values("p_value", inplace=True)

# Optional: apply p-value cutoff
significant_genes = fisher_results[fisher_results["p_value"] < 0.05].index.tolist()

# Step 4: subset mutation matrix to only significant genes
X_sig_mut = X_mut[significant_genes]

print("Total genes tested:", len(X_mut.columns))
print("Significant genes (p < 0.05):", len(significant_genes))
print("Shape of filtered mutation matrix:", X_sig_mut.shape)


Total genes tested: 12677
Significant genes (p < 0.05): 77
Shape of filtered mutation matrix: (432, 77)


DEEP LEARNING MODEL:

In [15]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Step 1: Define features and labels
X = X_sig_mut.values
y = mutation_with_response["response"].values

# Step 2: Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Step 3: Scale inputs (optional for binary, but helps optimization)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Define the model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

# Step 5: Train the model
history = model.fit(X_train_scaled, y_train, epochs=50, batch_size=16, validation_split=0.2, verbose=1)

# Step 6: Evaluate on test set
y_pred_prob = model.predict(X_test_scaled).flatten()
y_pred = (y_pred_prob >= 0.5).astype(int)

# Step 7: Metrics
auc = roc_auc_score(y_test, y_pred_prob)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"\nTest AUC: {auc:.3f}")
print("\nConfusion Matrix:")
print(cm)
print("\nClassification Report:")
print(report)


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.8223 - auc: 0.3849 - loss: 0.6507 - val_accuracy: 0.8406 - val_auc: 0.9373 - val_loss: 0.5495
Epoch 2/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8921 - auc: 0.8060 - loss: 0.4509 - val_accuracy: 0.8986 - val_auc: 0.9559 - val_loss: 0.4810
Epoch 3/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9102 - auc: 0.7486 - loss: 0.3859 - val_accuracy: 0.8986 - val_auc: 0.9644 - val_loss: 0.4294
Epoch 4/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9045 - auc: 0.7135 - loss: 0.3503 - val_accuracy: 0.8986 - val_auc: 0.9653 - val_loss: 0.4088
Epoch 5/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9259 - auc: 0.9243 - loss: 0.2468 - val_accuracy: 0.9130 - val_auc: 0.9737 - val_loss: 0.4060
Epoch 6/50
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━