# Notebook 5: DeepSurv (Deep Cox Proportional Hazards)

## Model Overview

DeepSurv is a neural network for survival analysis:
- **Loss**: Cox partial likelihood (negative log partial likelihood)
- **Architecture**: MLP with dropout and batch normalization
- **Output**: Single risk score (log hazard ratio)

## Configuration
- **Features**: 83 fixed (scaled)
- **Architecture**: [64, 64, 64] with SELU activation
- **Dropout**: 0.66 (best Optuna trial: 0.659456)
- **Evaluation**: `concordance_index_ipcw` from sksurv (competition metric)
- **CV Score**: 0.6902 weighted C-index

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import StratifiedKFold
from sksurv.metrics import concordance_index_ipcw
from sksurv.util import Surv
import optuna
from optuna.samplers import TPESampler

# Directory that holds the input CSVs; tuning results and the submission file
# are written to the same place.
TRAIN_PATH = '/your_path/SurvivalPrediction/data'

# Pick the compute device. Apple-silicon MPS stays the first choice (the
# original setting); CUDA and then CPU are fallbacks so the notebook still
# runs on machines without an MPS backend.
_mps_backend = getattr(torch.backends, 'mps', None)
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

## 1. Load Data

In [10]:
# Read the scaled feature table (keyed by patient ID) and the survival targets
X_train_full = pd.read_csv(f'{TRAIN_PATH}/X_train_83features_with_id_fixed_scaled.csv')
target = pd.read_csv(f'{TRAIN_PATH}/target_train_clean_aligned.csv')

# Reorder the feature rows to follow the patient order of `target`, then drop
# the ID index so that row i of X_train corresponds to row i of target
X_train = (
    X_train_full
    .set_index('ID')
    .loc[target['ID']]
    .reset_index(drop=True)
)
n_samples = X_train.shape[0]

# Outcome arrays: follow-up time and boolean event indicator
y_time = target['OS_YEARS'].to_numpy()
y_event = target['OS_STATUS'].to_numpy().astype(bool)

# Structured (event, time) array in the form sksurv metrics take
y_surv = Surv.from_arrays(event=y_event, time=y_time)

print(f"Features: {X_train.shape[1]}")
print(f"Samples: {n_samples}")
print(f"Events: {y_event.sum()} ({y_event.mean()*100:.1f}%)")

Features: 83
Samples: 3120
Events: 1600 (51.3%)


In [11]:
# The risk-group thresholds are applied to raw values, so read the unscaled
# version of the 83 features and put its rows in the same patient order as
# X_train (both follow target['ID'])
X_train_unscaled_full = pd.read_csv(f'{TRAIN_PATH}/X_train_83features_with_id_fixed.csv')
X_train_unscaled = (
    X_train_unscaled_full
    .set_index('ID')
    .loc[target['ID']]
    .reset_index(drop=True)
)

def define_risk_groups(X):
    """Build boolean subgroup masks from per-patient risk-factor counts.

    Five binary risk factors are derived from the columns ``BM_BLAST``,
    ``has_TP53``, ``HB``, ``PLT`` and ``cyto_risk_score`` of ``X`` and summed
    per row.

    Args:
        X: DataFrame containing the five columns named above.

    Returns:
        Dict with two boolean Series indexed like ``X``:
        ``'test_like'`` (at least one risk factor) and
        ``'high_risk'`` (at least two risk factors).
    """
    conditions = {
        'high_blast': X['BM_BLAST'] > 10,
        'has_TP53': X['has_TP53'] > 0,
        'low_hb': X['HB'] < 10,
        'low_plt': X['PLT'] < 50,
        'high_cyto': X['cyto_risk_score'] >= 3,
    }
    indicators = pd.DataFrame(
        {name: mask.astype(int) for name, mask in conditions.items()},
        index=X.index,
    )
    factor_count = indicators.sum(axis=1)
    return {'test_like': factor_count >= 1, 'high_risk': factor_count >= 2}

# Subgroup masks used by the weighted metric
risk_groups = define_risk_groups(X_train_unscaled)

# Stratification label "<event>_<tp53>" (e.g. "1_0"), one string per patient,
# combining the event indicator with TP53 status
has_tp53 = X_train_unscaled['has_TP53'].gt(0).astype(int).values
strat_var = pd.Series(
    [f"{int(evt)}_{int(tp53)}" for evt, tp53 in zip(y_event, has_tp53)]
)

## 2. DeepSurv Model

In [12]:
class DeepSurvNet(nn.Module):
    """MLP that maps a feature vector to a single risk score.

    Each hidden stage is Linear -> BatchNorm1d -> activation -> Dropout, and a
    final Linear layer produces one output per sample.

    Args:
        in_features: Number of input features.
        hidden_layers: Iterable of hidden-layer widths, one stage per entry.
        dropout: Dropout probability used after every hidden stage.
        activation: ``'relu'`` selects ReLU; any other value selects SELU.
    """

    def __init__(self, in_features, hidden_layers, dropout=0.1, activation='relu'):
        super().__init__()
        # One activation module instance is reused by every hidden stage.
        nonlinearity = nn.ReLU() if activation == 'relu' else nn.SELU()
        widths = [in_features, *hidden_layers]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.extend([
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nonlinearity,
                nn.Dropout(dropout),
            ])
        # Output head: a single risk score per sample.
        stack.append(nn.Linear(widths[-1], 1))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return the risk scores for ``x``, shape ``(batch, 1)``."""
        return self.net(x)

def cox_ph_loss(risk_pred, time, event):
    """Negative Cox log partial likelihood, summed over samples with an event.

    Args:
        risk_pred: Predicted risk scores, one per sample. Any shape holding one
            element per sample is accepted (``(n,)``, ``(n, 1)``, or a 0-dim
            tensor for a single sample); it is flattened internally.
        time: Observed times, one per sample.
        event: Event indicators, one per sample (1 = event, 0 = censored).

    Returns:
        Scalar tensor equal to ``-sum_i event_i * (risk_i - log_risk_set_i)``,
        where ``log_risk_set_i`` is the log-sum-exp of the risks of all samples
        sorted at or before ``i`` in descending time order.

    Note:
        Tied times are not pooled: a sample only sees the tied samples that
        ``argsort`` happened to place ahead of it.
    """
    # Flatten everything to 1-D. Without this, an (n, 1) model output would
    # broadcast against the (n,) event vector into an (n, n) matrix and
    # silently inflate the loss, and a 0-dim prediction could not be indexed.
    risk_pred = risk_pred.reshape(-1)
    time = time.reshape(-1)
    event = event.reshape(-1)

    # Descending time: the running log-sum-exp at position i then covers the
    # samples whose time is at least as large as sample i's.
    order = torch.argsort(time, descending=True)
    sorted_risk = risk_pred[order]
    sorted_event = event[order]
    log_risk_set = torch.logcumsumexp(sorted_risk, dim=0)
    log_partial = sorted_risk - log_risk_set
    # Censored samples contribute only through other samples' risk sets.
    return -torch.sum(log_partial * sorted_event)

## 3. Evaluation Metric

In [13]:
def weighted_cindex_ipcw(risk, y_surv_all, risk_groups, tau=7.0):
    """Compute the IPCW C-index overall and on two subgroups, plus their blend.

    Args:
        risk: Array of predicted risk scores aligned with ``y_surv_all``.
        y_surv_all: Structured survival array with ``'event'`` and ``'time'``
            fields. It is passed as the first argument of every
            ``concordance_index_ipcw`` call, also for the subgroup scores.
        risk_groups: Mapping with boolean Series under ``'test_like'`` and
            ``'high_risk'``, aligned with ``risk``.
        tau: Truncation time forwarded to ``concordance_index_ipcw``.

    Returns:
        Dict with keys ``'overall'``, ``'test_like'``, ``'high_risk'`` and
        ``'weighted'`` (0.3 * overall + 0.4 * test_like + 0.3 * high_risk).
    """
    def subgroup_cindex(group_name):
        # Restrict labels and predictions to the members of one subgroup.
        member = risk_groups[group_name].values
        y_subset = Surv.from_arrays(
            event=y_surv_all['event'][member],
            time=y_surv_all['time'][member],
        )
        return concordance_index_ipcw(y_surv_all, y_subset, risk[member], tau=tau)[0]

    scores = {
        'overall': concordance_index_ipcw(y_surv_all, y_surv_all, risk, tau=tau)[0],
        'test_like': subgroup_cindex('test_like'),
        'high_risk': subgroup_cindex('high_risk'),
    }
    scores['weighted'] = (
        0.3 * scores['overall'] + 0.4 * scores['test_like'] + 0.3 * scores['high_risk']
    )
    return scores

## 4. Global OOF Evaluation

In [14]:
def global_oof_evaluate(params, n_splits=5, seed=42):
    """Score one DeepSurv configuration on pooled out-of-fold predictions.

    Reads the module-level ``X_train``, ``y_time``, ``y_event``, ``y_surv``,
    ``strat_var``, ``risk_groups``, ``n_samples`` and ``device``.

    Args:
        params: Dict with keys ``'hidden_layers'``, ``'dropout'``,
            ``'activation'``, ``'lr'``, ``'batch_size'`` and ``'epochs'``.
        n_splits: Number of stratified folds.
        seed: Seed for the fold split; fold ``k`` trains with ``seed + k``.

    Returns:
        The dict produced by ``weighted_cindex_ipcw`` for the z-scored
        out-of-fold risk scores of all samples.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_risk = np.zeros(n_samples)

    features = X_train.values
    hidden_layers, dropout, activation, lr, batch_size, epochs = (
        params[key]
        for key in ('hidden_layers', 'dropout', 'activation', 'lr', 'batch_size', 'epochs')
    )

    def as_float_tensor(array):
        # Float tensor on the configured device.
        return torch.FloatTensor(array).to(device)

    for fold, (fit_rows, held_out_rows) in enumerate(splitter.split(features, strat_var)):
        # Seed both RNGs per fold: torch drives weight init and dropout,
        # numpy drives the per-epoch shuffling below.
        fold_seed = seed + fold
        torch.manual_seed(fold_seed)
        np.random.seed(fold_seed)

        X_fit = as_float_tensor(features[fit_rows])
        X_held_out = as_float_tensor(features[held_out_rows])
        time_fit = as_float_tensor(y_time[fit_rows])
        event_fit = as_float_tensor(y_event[fit_rows].astype(float))

        net = DeepSurvNet(features.shape[1], hidden_layers, dropout, activation).to(device)
        optimizer = torch.optim.Adam(net.parameters(), lr=lr, weight_decay=1e-4)

        n_fit = len(fit_rows)
        for _ in range(epochs):
            net.train()
            shuffled = np.random.permutation(n_fit)
            for lo in range(0, n_fit, batch_size):
                rows = shuffled[lo:lo + batch_size]
                # Trailing batches with fewer than 10 samples are skipped.
                if len(rows) >= 10:
                    optimizer.zero_grad()
                    scores = net(X_fit[rows]).squeeze()
                    batch_loss = cox_ph_loss(scores, time_fit[rows], event_fit[rows])
                    batch_loss.backward()
                    optimizer.step()

        # Predict the held-out fold in eval mode
        net.eval()
        with torch.no_grad():
            oof_risk[held_out_rows] = net(X_held_out).squeeze().cpu().numpy()

    # Global z-score over the pooled out-of-fold predictions
    centered = oof_risk - oof_risk.mean()
    oof_normalized = centered / (oof_risk.std() + 1e-8)
    return weighted_cindex_ipcw(oof_normalized, y_surv, risk_groups)

## 5. Hyperparameter Tuning with Optuna

We tune the following hyperparameters:
- **Architecture**: Number of layers (2-4) and hidden size (32-128 in steps of 32; every layer uses the same width)
- **Dropout**: Regularization strength (0.1-0.7)
- **Activation**: ReLU vs SELU
- **Learning rate**: 1e-5 to 1e-2
- **Batch size**: 32, 64, or 128
- **Epochs**: 20-100

Fixed: `weight_decay=1e-4` (standard L2 regularization)

In [15]:
def objective(trial):
    """Optuna objective: sample a DeepSurv configuration and return its score.

    Samples depth, width, dropout, activation, learning rate, batch size and
    epoch count from ``trial``, evaluates the configuration with
    ``global_oof_evaluate`` and stores the ``'overall'``, ``'test_like'`` and
    ``'high_risk'`` scores as user attributes on the trial.

    Returns:
        The ``'weighted'`` score of the evaluated configuration.
    """
    # Architecture: every hidden layer shares one width
    depth = trial.suggest_int('n_layers', 2, 4)
    width = trial.suggest_int('hidden_size', 32, 128, step=32)

    # Remaining parameters are sampled in dict order:
    # dropout, activation, lr, batch_size, epochs
    params = {
        'hidden_layers': [width] * depth,
        'dropout': trial.suggest_float('dropout', 0.1, 0.7),
        'activation': trial.suggest_categorical('activation', ['relu', 'selu']),
        'lr': trial.suggest_float('lr', 1e-5, 1e-2, log=True),
        'batch_size': trial.suggest_categorical('batch_size', [32, 64, 128]),
        'epochs': trial.suggest_int('epochs', 20, 100),
    }

    scores = global_oof_evaluate(params)

    # Keep the component scores next to the optimized value
    for metric in ('overall', 'test_like', 'high_risk'):
        trial.set_user_attr(metric, scores[metric])

    return scores['weighted']

banner = "=" * 70

print(banner)
print("DEEPSURV HYPERPARAMETER TUNING")
print(banner)
print("\nRunning Optuna optimization (100 trials)...")
print("This will take a while due to neural network training...\n")

# The objective returns the weighted C-index, so the study maximizes it;
# the TPE sampler is seeded with 42
study = optuna.create_study(
    study_name='deepsurv_83features',
    sampler=TPESampler(seed=42),
    direction='maximize',
)
study.optimize(objective, n_trials=100, show_progress_bar=True)

# --- Best trial -----------------------------------------------------------
best_attrs = study.best_trial.user_attrs

print("\n" + banner)
print("BEST TRIAL")
print(banner)
print(f"\nWeighted C-index: {study.best_value:.4f}")
print(f"Overall:   {best_attrs['overall']:.4f}")
print(f"Test-like: {best_attrs['test_like']:.4f}")
print(f"High-risk: {best_attrs['high_risk']:.4f}")

print("\nBest hyperparameters:")
for key, value in study.best_params.items():
    # Floats get six decimals, everything else its default representation
    print(f"  {key}: {value:.6f}" if isinstance(value, float) else f"  {key}: {value}")

# hidden_layers is not a sampled parameter; rebuild it for display
n_layers = study.best_params['n_layers']
hidden_size = study.best_params['hidden_size']
print(f"  hidden_layers: {[hidden_size] * n_layers}")

# --- Top 10 trials --------------------------------------------------------
print("\n" + banner)
print("TOP 10 TRIALS")
print(banner)

all_trials = study.trials_dataframe()
trials_df = all_trials.sort_values('value', ascending=False).head(10)

header_fmt = "\n{:<6} | {:>8} | {:>8} | {:>9} | {:>9} | {:>6} | {:>10}"
row_fmt = "{:<6} | {:>8.4f} | {:>8.4f} | {:>9.4f} | {:>9.4f} | {:>6} | {:>10}"
score_columns = (
    'value',
    'user_attrs_overall',
    'user_attrs_test_like',
    'user_attrs_high_risk',
)

print(header_fmt.format(
    "Trial", "Weighted", "Overall", "Test-like", "High-risk", "Epochs", "Activation"))
print("-" * 75)

for _, row in trials_df.iterrows():
    print(row_fmt.format(
        int(row['number']),
        *(row[column] for column in score_columns),
        int(row['params_epochs']),
        row['params_activation']))

# --- Save results ---------------------------------------------------------
best_row = {
    'model': 'DeepSurv_83features',
    'weighted_score': study.best_value,
    'overall': best_attrs['overall'],
    'test_like': best_attrs['test_like'],
    'high_risk': best_attrs['high_risk'],
}
best_row.update(study.best_params)
best_params_df = pd.DataFrame([best_row])
best_params_df.to_csv(f'{TRAIN_PATH}/deepsurv_83features_best_params.csv', index=False)

all_trials.to_csv(f'{TRAIN_PATH}/deepsurv_83features_trials.csv', index=False)

print("\nResults saved:")
print("  - deepsurv_83features_best_params.csv")
print("  - deepsurv_83features_trials.csv")

DEEPSURV HYPERPARAMETER TUNING

Running Optuna optimization (100 trials)...
This will take a while due to neural network training...



  0%|          | 0/100 [00:00<?, ?it/s]


BEST TRIAL

Weighted C-index: 0.6902
Overall:   0.7175
Test-like: 0.6921
High-risk: 0.6605

Best hyperparameters:
  n_layers: 3
  hidden_size: 64
  dropout: 0.659456
  activation: selu
  lr: 0.000112
  batch_size: 32
  epochs: 58
  hidden_layers: [64, 64, 64]

TOP 10 TRIALS

Trial  | Weighted |  Overall | Test-like | High-risk | Epochs | Activation
---------------------------------------------------------------------------
57     |   0.6902 |   0.7175 |    0.6921 |    0.6605 |     58 |       selu
42     |   0.6896 |   0.7164 |    0.6911 |    0.6607 |     63 |       selu
32     |   0.6893 |   0.7153 |    0.6910 |    0.6612 |     63 |       selu
68     |   0.6890 |   0.7177 |    0.6910 |    0.6574 |     65 |       selu
74     |   0.6888 |   0.7174 |    0.6911 |    0.6573 |     65 |       selu
63     |   0.6888 |   0.7170 |    0.6909 |    0.6578 |     56 |       selu
60     |   0.6887 |   0.7174 |    0.6907 |    0.6573 |     42 |       selu
81     |   0.6886 |   0.7177 |    0.6910 |    0

## 6. Verification: Re-evaluate Best Parameters

In [16]:
# Verify best hyperparameters from tuning
print("Verifying best hyperparameters from Optuna tuning...\n")

# Use the study's best trial when the tuning cell was run in this session.
# Otherwise fall back to the best trial recorded from the 100-trial run, so
# the verification and final-training cells can run without repeating the
# search.
try:
    _tuned = study.best_params
except NameError:
    BEST_PARAMS = {
        'hidden_layers': [64, 64, 64],
        'dropout': 0.6594556222699043,
        'activation': 'selu',
        'lr': 0.00011237410700529054,
        'batch_size': 32,
        'epochs': 58,
    }
    print("Using hardcoded best parameters (Optuna study not found in this session):")
else:
    BEST_PARAMS = {
        # hidden_layers is not sampled directly; rebuild it from width and depth
        'hidden_layers': [_tuned['hidden_size']] * _tuned['n_layers'],
        'dropout': _tuned['dropout'],
        'activation': _tuned['activation'],
        'lr': _tuned['lr'],
        'batch_size': _tuned['batch_size'],
        'epochs': _tuned['epochs'],
    }
    print("Using best parameters from Optuna study:")

for k, v in BEST_PARAMS.items():
    print(f"  {k}: {v}")

# Re-run the same cross-validated evaluation used during tuning
result = global_oof_evaluate(BEST_PARAMS)
print(f"\nCV Results (Global OOF, competition metric):")
print(f"  Overall C-index:   {result['overall']:.4f}")
print(f"  Test-like C-index: {result['test_like']:.4f}")
print(f"  High-risk C-index: {result['high_risk']:.4f}")
print(f"  Weighted C-index:  {result['weighted']:.4f}")

Verifying best hyperparameters from Optuna tuning...

Using best parameters from Optuna study:
  hidden_layers: [64, 64, 64]
  dropout: 0.6594556222699043
  activation: selu
  lr: 0.00011237410700529054
  batch_size: 32
  epochs: 58

CV Results (Global OOF, competition metric):
  Overall C-index:   0.7175
  Test-like C-index: 0.6921
  High-risk C-index: 0.6605
  Weighted C-index:  0.6902


## 7. Train Final Model on Full Data

In [None]:
# Fit the final network on every training sample with BEST_PARAMS.
# Both RNGs are seeded before the model is built: torch drives weight
# initialization and dropout, numpy drives the per-epoch shuffling.
torch.manual_seed(42)
np.random.seed(42)

X_tensor = torch.FloatTensor(X_train.values).to(device)
time_tensor = torch.FloatTensor(y_time).to(device)
event_tensor = torch.FloatTensor(y_event.astype(float)).to(device)

final_model = DeepSurvNet(
    in_features=X_train.shape[1],
    hidden_layers=BEST_PARAMS['hidden_layers'],
    dropout=BEST_PARAMS['dropout'],
    activation=BEST_PARAMS['activation'],
).to(device)

optimizer = torch.optim.Adam(
    final_model.parameters(),
    lr=BEST_PARAMS['lr'],
    weight_decay=1e-4,
)

batch_size = BEST_PARAMS['batch_size']
n_train = X_train.shape[0]

print(f"Training for {BEST_PARAMS['epochs']} epochs...")
for epoch in range(BEST_PARAMS['epochs']):
    final_model.train()
    indices = np.random.permutation(n_train)
    for start in range(0, n_train, batch_size):
        batch_idx = indices[start:start + batch_size]
        # Trailing batches with fewer than 10 samples are skipped
        if len(batch_idx) >= 10:
            optimizer.zero_grad()
            risk_pred = final_model(X_tensor[batch_idx]).squeeze()
            loss = cox_ph_loss(risk_pred, time_tensor[batch_idx], event_tensor[batch_idx])
            loss.backward()
            optimizer.step()

print("Training complete.")

In [None]:
# Load test data and predict
X_test_full = pd.read_csv(f'{TRAIN_PATH}/X_test_83features_with_id_fixed_scaled.csv')
test_ids = X_test_full['ID'].values

# The network consumes features by position, so select the test columns by
# name in the training column order. A test CSV with reordered columns would
# otherwise be scored silently wrong; a missing training feature now fails
# loudly instead.
X_test = X_test_full.drop(columns=['ID'])[X_train.columns]

final_model.eval()
with torch.no_grad():
    X_test_tensor = torch.FloatTensor(X_test.values).to(device)
    # squeeze(-1) drops only the output dimension, so a single-row test set
    # still yields a 1-D array (a bare squeeze() would make it 0-dim and
    # break len() below).
    test_risk = final_model(X_test_tensor).squeeze(-1).cpu().numpy()

print(f"Test predictions: {len(test_risk)} samples")
print(f"Risk range: [{test_risk.min():.4f}, {test_risk.max():.4f}]")

# One row per test patient: ID plus predicted risk score
submission = pd.DataFrame({'ID': test_ids, 'risk_score': test_risk})
submission.to_csv(f'{TRAIN_PATH}/submission_deepsurv_83features.csv', index=False)
print(f"Saved: submission_deepsurv_83features.csv")

## Summary

### DeepSurv Model Results (Competition Metric)

| Metric | Value |
|--------|-------|
| Overall C-index | 0.7175 |
| Test-like C-index | 0.6921 |
| High-risk C-index | 0.6605 |
| **Weighted C-index** | **0.6902** |

### Key Findings
1. Neural network approach provides different error patterns than tree models
2. SELU activation with high dropout (≈0.66) works best
3. Useful for ensembling with XGBoost AFT
4. Optimal ensemble weight: 25% (combined with 75% XGBoost AFT)