In [417]:
from typing import Any, Dict, Iterable, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf

# Generate data

In [418]:
N_COVAR = 128

In [419]:
def gen_data(
  n: int,
  base_rate=0.50,
  cens_prop=0.20,
  n_covar=N_COVAR,
  seed=None,
) -> Dict[str, np.ndarray]:
  """Simulate right-censored, exponentially distributed time-to-event data.

  Each subject's event rate is `base_rate * exp(eta)`, where `eta` is a
  standardized linear combination of uniform covariates. The censoring rate is
  `cens_prop / (1 - cens_prop)` times the event rate, so that each subject is
  censored before its event with probability `cens_prop`.

  Args:
    n: Sample size.
    base_rate: Base event rate.
    cens_prop: Expected censoring proportion, in [0, 1).
    n_covar: Number of covariates.
    seed: Optional seed for reproducible draws. When None, numpy's global
      random state is used.

  Returns:
    Dictionary with keys:
      x: (n, n_covar) covariate matrix.
      risk: (n,) subject-specific event rates.
      status: (n,) boolean, True if the event was observed.
      time: (n,) observed time, the minimum of event and censoring time.
      y: (n, 2) float matrix with columns (status, time).

  Raises:
    ValueError: If `cens_prop` is outside [0, 1).

  """
  if not 0 <= cens_prop < 1:
    raise ValueError(f"cens_prop must be in [0, 1), got {cens_prop}.")

  # Without a seed, draw from numpy's global state in the usual order.
  rng = np.random if seed is None else np.random.RandomState(seed)

  # Covariates.
  x = rng.rand(n, n_covar)

  # Linear predictor, standardized to mean 0 and unit standard deviation.
  coef = rng.randn(n_covar)
  eta = np.dot(x, coef)
  eta = (eta - np.mean(eta)) / np.std(eta)

  # Time-to-event.
  event_rate = base_rate * np.exp(eta)
  event_time = rng.exponential(scale=1 / event_rate, size=len(event_rate))

  # Censoring times. With no censoring requested, every censoring time is
  # infinite; this avoids dividing by a zero censoring rate.
  if cens_prop > 0:
    cens_rate = cens_prop / (1 - cens_prop) * event_rate
    cens_time = rng.exponential(scale=1 / cens_rate, size=len(cens_rate))
  else:
    cens_time = np.full(len(event_rate), np.inf)

  status = (event_time <= cens_time)
  time = np.where(status, event_time, cens_time)

  # Target matrix; stacking promotes the boolean status to float.
  y = np.stack((status, time), axis=1)

  # Output.
  return {
    "x": x,
    "risk": event_rate,
    "status": status,
    "time": time,
    "y": y,
  }

In [420]:
def split_data(
  data: Dict[str, np.ndarray],
  train_prop: float = 0.6,
  val_prop: float = 0.2,
) -> Dict[str, np.ndarray]:
  """Split every array in `data` into contiguous train/val/test pieces.

  Rows are taken in their existing order (no shuffling): the first
  `int(n * train_prop)` rows form the training set, the next
  `int(n * val_prop)` rows the validation set, and all remaining rows the
  test set.

  Args:
    data: Arrays sharing a common first dimension. Must contain the key
      "time", whose length defines the sample size.
    train_prop: Proportion of rows assigned to training.
    val_prop: Proportion of rows assigned to validation.

  Returns:
    Dictionary with keys "train_<key>", "val_<key>" and "test_<key>" for each
    key of `data`.

  """
  n = len(data["time"])
  test_prop = 1 - (train_prop + val_prop)
  assert test_prop >= 0, "train_prop + val_prop must not exceed 1."

  n_train = int(n * train_prop)
  val_end = n_train + int(n * val_prop)

  out = {}
  for key, values in data.items():
    out[f"train_{key}"] = values[:n_train]
    out[f"val_{key}"] = values[n_train:val_end]
    # The test split takes whatever is left, so no row is dropped.
    out[f"test_{key}"] = values[val_end:]

  return out

## Overall

In [424]:
class PrepData:
  """Simulate a survival data set once and keep its train/val/test split.

  The simulated data and its split are generated on construction; the
  settings used are stored as attributes for later reference.

  """

  def __init__(
    self,
    n: int,
    base_rate=0.50,
    batch_size=128,
    cens_prop=0.20,
    n_covar=N_COVAR,
    train_prop=0.6,
    val_prop=0.2,
  ) -> None:
    """Generate and split the data.

    Args:
      n: Sample size.
      base_rate: Base event rate passed to `gen_data`.
      batch_size: Batch size. Not used here; stored so downstream code can
        read it back.
      cens_prop: Expected censoring proportion passed to `gen_data`.
      n_covar: Number of covariates passed to `gen_data`.
      train_prop: Training proportion passed to `split_data`.
      val_prop: Validation proportion passed to `split_data`.

    """
    # Record every setting, including those previously discarded.
    self.n = n
    self.base_rate = base_rate
    self.batch_size = batch_size
    self.cens_prop = cens_prop
    self.n_covar = n_covar
    self.train_prop = train_prop
    self.val_prop = val_prop

    self.data = gen_data(n, base_rate, cens_prop, n_covar)
    # `split_data` on the right-hand side is the module-level function; the
    # attribute of the same name holds its result.
    self.split_data = split_data(self.data, train_prop, val_prop)

  def get_orig_data(self) -> Dict[str, np.ndarray]:
    """Return the full, unsplit simulated data."""
    return self.data

  def get_split_data(self) -> Dict[str, np.ndarray]:
    """Return the split data, keyed by "train_", "val_" and "test_" prefixes."""
    return self.split_data

In [425]:
class DataGenerator:
  """Serve fixed-size (x, y) batches, in order, as a `tf.data.Dataset`.

  Batches are consecutive slices of the input arrays. Only full batches are
  produced: the trailing `n % batch_size` rows are never yielded, which keeps
  every batch consistent with the fixed-shape output signature.

  """

  def __init__(
    self,
    x: np.ndarray,
    y: np.ndarray,
    batch_size=128,
  ):
    """Store the data and derive the number of batches per epoch.

    Args:
      x: (n, n_covar) covariate matrix.
      y: Target array with one row per row of `x`; the dataset signature
        declares two columns per row.
      batch_size: Number of rows per batch.

    Raises:
      ValueError: If `batch_size` is not positive or `x` and `y` have
        different numbers of rows.

    """
    if batch_size <= 0:
      raise ValueError(f"batch_size must be positive, got {batch_size}.")
    if len(x) != len(y):
      raise ValueError(
        f"x and y must have the same number of rows, got {len(x)} and {len(y)}.")

    self.n = x.shape[0]
    self.x = x
    self.y = y
    self.batch_size = batch_size
    self.n_covar = x.shape[1]
    # Integer division: an incomplete final batch is dropped.
    self.steps_per_epoch = self.n // batch_size

  def get_batch(self, index: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    """Return the rows of `x` and `y` selected by `index`."""
    x = self.x[index]
    y = self.y[index]
    return x, y

  def generator(self) -> Iterable[Tuple[np.ndarray, np.ndarray]]:
    """Yield `steps_per_epoch` consecutive (x, y) batches."""
    index = np.arange(self.n)
    for b in range(self.steps_per_epoch):
      start = b * self.batch_size
      idx = index[start:(start + self.batch_size)]
      yield self.get_batch(idx)

  def make_dataset(self) -> tf.data.Dataset:
    """Create dataset from generator."""
    ds = tf.data.Dataset.from_generator(
      self.generator,
      output_signature=(
        tf.TensorSpec(shape=(self.batch_size, self.n_covar), dtype=tf.float32),
        tf.TensorSpec(shape=(self.batch_size, 2), dtype=tf.float32)
      )
    )
    return ds

  def __call__(self) -> tf.data.Dataset:
    """Shorthand for `make_dataset()`."""
    return self.make_dataset()

In [448]:
def prep_datasets(
    split_data: Dict[str, np.ndarray],
    batch_size=128,
  ) -> Dict[str, tf.data.Dataset]:
  """Build one batched dataset per partition.

  Args:
    split_data: Dictionary holding "<part>_x" and "<part>_y" arrays for each
      of the partitions "train", "val" and "test".
    batch_size: Batch size handed to `DataGenerator`.

  Returns:
    Dictionary mapping "train", "val" and "test" to their datasets.

  """
  partitions = ("train", "val", "test")
  return {
    part: DataGenerator(
      split_data[f"{part}_x"],
      split_data[f"{part}_y"],
      batch_size,
    )()
    for part in partitions
  }

# Kaplan-Meier

## Build masks

In [7]:
def build_masks(
  status: np.ndarray,
  time: np.ndarray,
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
  """Build masks.

  Create unique-time (row) by subject (col) boolean masks. Time 0 is always
  included among the rows, even if no subject has time 0.

  Args:
    status: (n,) event indicators; truthy if the event was observed, falsy if
      the subject was censored.
    time: (n,) observation times.

  Returns:
    at_risk: [t, i] = True if subject i is at risk at time t.
    cens: [t, i] = True if subject i is censored at time t.
    event: [t, i] = True if subject i has an event at time t.
    unique_times: Sorted unique times corresponding to the rows.

  """
  time = np.asarray(time)
  is_event = np.asarray(status).astype(bool)

  # Sorted unique times plus 0. Taking the union (rather than prepending 0)
  # keeps the grid sorted whatever the sign of the observed times.
  unique_times = np.union1d(time, np.zeros(1, dtype=time.dtype))

  # Broadcast the time grid (rows) against the subjects (columns).
  grid = unique_times[:, np.newaxis]
  subj_time = time[np.newaxis, :]

  at_risk = subj_time >= grid
  exits_here = subj_time == grid
  event = exits_here & is_event[np.newaxis, :]
  cens = exits_here & ~is_event[np.newaxis, :]

  return at_risk, cens, event, unique_times

In [8]:
# Toy example: five subjects with distinct observation times 0, ..., 4.
# The status vector is fixed rather than drawn at random, so the masks printed
# in the following cells are identical on every run. Subject 3 is the only
# censored observation.
n = 5
status = np.array([True, True, True, False, True])
time = np.arange(n, dtype=float)

In [9]:
print("Status:")
print(status)
print("\n")
print("Time:")
print(time)

Status:
[ True  True  True False  True]


Time:
[0. 1. 2. 3. 4.]


In [10]:
at_risk, cens, event, unique_times = build_masks(status, time)

In [11]:
print("At risk:")
print(at_risk)

At risk:
[[ True  True  True  True  True]
 [False  True  True  True  True]
 [False False  True  True  True]
 [False False False  True  True]
 [False False False False  True]]


In [12]:
print("Censored:")
print(cens)

Censored:
[[False False False False False]
 [False False False False False]
 [False False False False False]
 [False False False  True False]
 [False False False False False]]


In [13]:
print("Event:")
print(event)

Event:
[[ True False False False False]
 [False  True False False False]
 [False False  True False False]
 [False False False False False]
 [False False False False  True]]


## Tabulate KM

In [14]:
def tab_km(status: np.ndarray, time: np.ndarray) -> pd.DataFrame:
  """Tabulate the Kaplan-Meier estimator.

  Args:
    status: Event indicators, True for an observed event.
    time: Observation times.

  Returns:
    One row per unique time with columns: time, n_at_risk, n_event, n_cens,
    haz (events divided by number at risk) and surv (running product of
    1 - haz).

  """
  at_risk, cens, event, unique_times = build_masks(status, time)

  # Row sums of the masks count the subjects in each state at each time.
  table = pd.DataFrame({
    "time": unique_times,
    "n_at_risk": at_risk.sum(axis=1),
    "n_event": event.sum(axis=1),
    "n_cens": cens.sum(axis=1),
  })
  table["haz"] = table["n_event"] / table["n_at_risk"]
  table["surv"] = (1 - table["haz"]).cumprod()
  return table

## Estimator

In [15]:
class KaplanMeier:
  """Kaplan-Meier survival curve that can be evaluated as a step function."""

  def __init__(self, status: np.ndarray, time: np.ndarray) -> None:
    """Fit the curve.

    Args:
      status: Event indicators, True for an observed event.
      time: Observation times.

    """
    self.status = status
    self.time = time
    self.km = tab_km(self.status, self.time)

  def return_table(self) -> pd.DataFrame:
    """Return the tabulated curve."""
    return self.km

  def __call__(self, x: float) -> float:
    """Evaluate the survival curve at `x`.

    The curve is a right-continuous step function: the value at `x` is the
    tabulated survival at the largest tabulated time that is <= `x`. Before
    the first tabulated time, survival is 1.

    Args:
      x: Evaluation time. An array of times is also accepted, in which case
        an array of the same shape is returned.

    Returns:
      Survival probability at `x`.

    """
    # The lookup relies on the table's times being in ascending order.
    times = self.km["time"].to_numpy()
    surv = self.km["surv"].to_numpy()

    # Position of the last tabulated time <= x; -1 if x precedes all of them.
    idx = np.searchsorted(times, x, side="right") - 1
    out = np.where(idx >= 0, surv[np.maximum(idx, 0)], 1.0)

    # `[()]` unwraps a 0-d result to a numpy scalar and leaves arrays as is.
    return out[()]

In [16]:
# Fit Kaplan-Meier.
km = KaplanMeier(
  status = np.array([True, False, True, False]),
  time = np.arange(1, 5)
)
km.return_table()

Unnamed: 0,time,n_at_risk,n_event,n_cens,haz,surv
0,0,4,0,0,0.0,1.0
1,1,4,1,0,0.25,0.75
2,2,3,0,1,0.0,0.75
3,3,2,1,0,0.5,0.375
4,4,1,0,1,0.0,0.375


In [17]:
# Evaluate Kaplan-Meier.
km(1.1)

0.75

# C-statistic

* Reference: [On the C-statistics for Evaluating Overall Adequacy of Risk Prediction Procedures with Censored Survival Data](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3079915/).

In [18]:
class Concord:
  """Calculate concordance.

  Computes an inverse-probability-of-censoring weighted C-statistic: each
  usable pair (i, j), where subject i has an observed event before time
  `tau` and subject j is observed for longer, receives weight 1 / G(t_i)^2,
  with G the Kaplan-Meier estimate of the censoring survival function.

  Note that the Kaplan-Meier curve of the censoring distribution may be fit
  using different data from that used to calculate the C-statistic.

  """

  def __init__(self, status: np.ndarray, time: np.ndarray) -> None:
    """Fit the censoring distribution.

    Args:
      status: Event indicators, True for an observed event.
      time: Observation times.

    """
    # For the censoring distribution, being censored is the "event".
    self.km = KaplanMeier(~np.asarray(status, dtype=bool), time)

  def __call__(
    self,
    risk: np.ndarray,
    status: np.ndarray,
    time: np.ndarray,
    tau=None
  ) -> float:
    """Calculate the C-statistic.

    Args:
      risk: Predicted risks; a higher risk should mean an earlier event.
      status: Event indicators, True for an observed event.
      time: Observation times.
      tau: Truncation time; only events strictly before `tau` contribute.
        Defaults to the largest observed time.

    Returns:
      Weighted proportion of usable pairs in which the subject with the
      earlier event has the strictly higher risk, or 0.5 if there are no
      usable pairs.

    Raises:
      ValueError: If the inputs have different lengths.

    """
    risk = np.asarray(risk).reshape(-1)
    status = np.asarray(status).astype(bool).reshape(-1)
    time = np.asarray(time).reshape(-1)
    if not (len(risk) == len(status) == len(time)):
      raise ValueError("risk, status and time must have the same length.")
    if len(time) == 0:
      return 0.5

    # Compare against None explicitly so that tau=0 is honoured.
    if tau is None:
      tau = np.max(time)

    upper = 0.0
    lower = 0.0

    # Only cases contribute.
    for i in np.flatnonzero(status):
      ti = time[i]
      if not ti < tau:
        continue

      # Comparators: subjects observed strictly longer than subject i.
      later = time > ti
      n_later = np.count_nonzero(later)
      if n_later == 0:
        continue

      # The censoring weight depends on subject i only, so compute it once.
      p_cens = np.squeeze(self.km(ti))
      weight = 1 / (p_cens ** 2)
      upper += weight * np.count_nonzero(risk[later] < risk[i])
      lower += weight * n_later

    return float(upper / lower) if lower > 0 else 0.5

# Proportional hazards loss

In [367]:
class CoxLoss(tf.keras.losses.Loss):
  """Negative Cox partial log-likelihood, averaged over observed events.

  For each subject i with an observed event, the contribution is the risk
  score of subject i minus the log-sum-exp of the risk scores of all subjects
  whose time is >= subject i's time. Tied times are not treated specially.

  """

  def call(
      self,
      y_true: tf.Tensor,
      y_pred: tf.Tensor
  ) -> tf.Tensor:
    """Calculate Cox PH Loss.

    Args:
      y_true: (batch, 2) tensor with columns (status, time).
      y_pred: Risk scores (log relative hazards), one per subject.

    Returns:
      Scalar loss: minus the mean log partial likelihood over subjects with
      an observed event, or 0 if the batch contains no events.

    """
    y_true = tf.cast(y_true, tf.float32)

    # Note: autograph requires unpacking using indices.
    status = tf.cast(y_true[:, 0], dtype=bool)
    time = y_true[:, 1]

    # Flatten (rather than squeeze) so a batch of one stays a vector.
    risk_score = tf.reshape(tf.cast(y_pred, tf.float32), [-1])

    # Matrix where `at_risk[i, j] = True` if subject j is at risk
    # at the event time for subject i. Built by broadcasting, so the batch
    # size does not need to be known statically.
    at_risk = time[tf.newaxis, :] >= time[:, tf.newaxis]

    # Only at-risk subjects contribute to the denominator: the others are
    # set to -inf, which contributes exp(-inf) = 0 to the log-sum-exp. Every
    # row keeps at least its own subject, so each row has a finite maximum.
    neg_inf = tf.constant(float("-inf"), dtype=risk_score.dtype)
    masked_score = tf.where(at_risk, risk_score[tf.newaxis, :], neg_inf)
    denom = tf.reduce_logsumexp(masked_score, axis=1)

    # The log-likelihood only increments at event times.
    diff = tf.subtract(risk_score, denom)
    event_diff = tf.where(status, diff, tf.zeros_like(diff))
    n_event = tf.reduce_sum(tf.cast(status, diff.dtype))

    # divide_no_nan returns 0 for a batch without events instead of NaN.
    return -1 * tf.math.divide_no_nan(tf.reduce_sum(event_diff), n_event)


## Testing

In [252]:
def logsumexp(x: np.ndarray) -> np.ndarray:
  """Compute log(sum(exp(x))) over all elements of `x`.

  The maximum of `x` is subtracted before exponentiating and added back
  afterwards.

  """
  shift = np.max(x)
  total = np.exp(x - shift).sum()
  return shift + np.log(total)

In [253]:
# Manual check of the Cox loss on a three-subject example.
# The `if False:` guard means none of the code below is ever executed.
# NOTE(review): `y_true` is passed to the loss as a (status, time) tuple
# below, whereas `CoxLoss.call` slices `y_true[:, 0]` and `y_true[:, 1]`;
# presumably this mismatch is why the check is disabled -- confirm before
# re-enabling.
if False:
  time = tf.constant(np.array([1, 2, 3]), dtype=tf.float32)
  status = tf.constant(np.array([True, False, True]), dtype=bool)
  risk_score = tf.constant(np.array([3, 2, 1]), dtype=tf.float32)
  # at_risk[i, j] = True if time[j] >= time[i].
  # NOTE(review): this passes `dtype=` to map_fn, whereas CoxLoss passes
  # `fn_output_signature=`.
  at_risk = tf.map_fn(lambda x: (time >= x), time, dtype=bool)

  # Denominator calculation: row i is the log-sum-exp of the risk scores of
  # the subjects at risk at time[i].
  risk_score_mat = tf.math.multiply(
      tf.ones_like(at_risk, dtype=risk_score.dtype), risk_score)
  denom = tf.reduce_logsumexp(
      tf.ragged.boolean_mask(risk_score_mat, at_risk), axis=1)

  # Expected denominators from the numpy reference: the risk sets are
  # {0, 1, 2}, {1, 2} and {2}.
  exp_denom = np.array([
      logsumexp([3., 2., 1.]),
      logsumexp([2., 1.]),
      logsumexp([1.])
  ])
  assert np.allclose(denom.numpy(), exp_denom)

  # Overall calculation: minus the mean of (risk score - denominator) over
  # the subjects with an event.
  y_true = (status, time)
  y_pred = risk_score
  loss_fn = CoxLoss()
  obs = loss_fn(y_true, y_pred)
  exp = -1 * np.sum(
      status.numpy() * (risk_score.numpy() - exp_denom)) / np.sum(status.numpy())
  assert np.allclose(obs, exp)

# Modeling

## Model architecture

In [433]:
def linear_model(n_covar=None) -> tf.keras.Model:
  """Build a linear risk model: a single dense unit on the covariates.

  Args:
    n_covar: Number of input covariates. Defaults to the notebook-level
      `N_COVAR`, read when the function is called.

  Returns:
    Uncompiled Keras model mapping (batch, n_covar) inputs to (batch, 1)
    risk scores.

  """
  if n_covar is None:
    n_covar = N_COVAR

  # Named `inputs` to avoid shadowing the builtin `input`.
  inputs = tf.keras.layers.Input(
    shape=(n_covar,), name="input", dtype=tf.float32)
  output = tf.keras.layers.Dense(1, name="output")(inputs)
  model = tf.keras.Model(inputs, output, name="model")
  return model

In [434]:
def dropout_model(n_covar=None) -> tf.keras.Model:
  """Build a two-hidden-layer risk model with dropout and L2 penalties.

  The network is: dense (32 units, ReLU, L2 kernel penalty), dropout (rate
  0.5), dense (32 units, ReLU, L2 kernel penalty), then a single linear
  output unit.

  Args:
    n_covar: Number of input covariates. Defaults to the notebook-level
      `N_COVAR`, read when the function is called.

  Returns:
    Uncompiled Keras model mapping (batch, n_covar) inputs to (batch, 1)
    risk scores.

  """
  if n_covar is None:
    n_covar = N_COVAR

  # Named `inputs` to avoid shadowing the builtin `input`.
  inputs = tf.keras.layers.Input(shape=(n_covar,), name="input")
  h = tf.keras.layers.Dense(
    32,
    activation="relu",
    kernel_regularizer=tf.keras.regularizers.L2(),
    name="dense1"
  )(inputs)
  h = tf.keras.layers.Dropout(0.50, name="drop")(h)
  h = tf.keras.layers.Dense(
    32,
    activation="relu",
    kernel_regularizer=tf.keras.regularizers.L2(),
    name="dense2"
  )(h)
  output = tf.keras.layers.Dense(1, name="output")(h)
  model = tf.keras.Model(inputs, output, name="model")
  return model

## Data sets

In [435]:
data_fn = PrepData(n=128 * 10)

## Built-in training

### From tensors

In [436]:
# Model.
model = linear_model()

In [437]:
model.compile(
  optimizer=tf.keras.optimizers.Adam(),
  loss=CoxLoss()
)

In [438]:
data = data_fn.get_orig_data()
x = tf.constant(data["x"], dtype=tf.float32)
y = tf.constant(data["y"], dtype=tf.float32)

In [439]:
history = model.fit(
  x=x,
  y=y,
  epochs=2,
)

Epoch 1/2
Epoch 2/2


### From datasets

In [440]:
# Model.
model = linear_model()

In [441]:
model.compile(
  optimizer=tf.keras.optimizers.Adam(),
  loss=CoxLoss()
)

In [442]:
data_generator = DataGenerator(data["x"], data["y"])
ds = data_generator()

In [443]:
# Training dataset.
history = model.fit(
  x=ds,
  epochs=2,
)

Epoch 1/2
Epoch 2/2


In [449]:
# Build the training, validation, and test datasets.
ds = prep_datasets(data_fn.get_split_data())

In [451]:
# Fit on the training dataset, evaluating on the validation dataset each epoch.
history = model.fit(
  x=ds["train"],
  epochs=2,
  validation_data=ds["val"]
)

Epoch 1/2
Epoch 2/2


In [452]:
# Evaluate model.
model.evaluate(ds["test"])



3.884826898574829