In [1]:
import config as cfg
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import torch
from data_loader import *
from utility.config import load_config

  from .autonotebook import tqdm as notebook_tqdm


In [64]:
data_config = load_config(cfg.DGP_CONFIGS_DIR, f"synthetic_se.yaml")
dl = SingleEventSyntheticDataLoader().load_data(data_config=data_config,
                                                linear=True, copula_name="clayton",
                                                k_tau=0.25, device="cpu", dtype=torch.float64)

n_samples = dl.get_data()[0].shape[0]
n_features = dl.get_data()[0].shape[1]
times = dl.get_data()[1]
events = dl.get_data()[2]
n_censored = int(events.shape[0] - events.sum())
max_t = times.max()

print(f"Number of samples: {n_samples}")
print(f"Number of features: {n_features}")
print(f"Max t: {max_t}")
print(f"Number of censored/uncensored: {n_censored}/{events.shape[0]-n_censored}")
print(f"{round(n_censored / events.shape[0] * 100, 2)}% of records are censored")
print(f"{round((events.shape[0]-n_censored) / events.shape[0] * 100, 2)}% of records are uncensored\n")

Number of samples: 5000
Number of features: 10
Max t: 17.04397297298822
Number of censored/uncensored: 2378/2622
47.56% of records are censored
52.44% of records are uncensored



In [65]:
dl = SeerSingleDataLoader().load_data()
n_samples = dl.get_data()[0].shape[0]
n_features = dl.get_data()[0].shape[1]
times = dl.get_data()[1]
events = dl.get_data()[2]
n_censored = int(events.shape[0] - events.sum())
max_t = times.max()

print(f"Number of samples: {n_samples}")
print(f"Number of features: {n_features}")
print(f"Max t: {max_t}")
print(f"Number of censored/uncensored: {n_censored}/{events.shape[0]-n_censored}")
print(f"{round(n_censored / events.shape[0] * 100, 2)}% of records are censored")
print(f"{round((events.shape[0]-n_censored) / events.shape[0] * 100, 2)}% of records are uncensored\n")

Number of samples: 19246
Number of features: 17
Max t: 121
Number of censored/uncensored: 10492/8754
54.52% of records are censored
45.48% of records are uncensored



In [66]:
dl = SeerCompetingDataLoader().load_data()
n_samples = dl.get_data()[0].shape[0]
n_features = dl.get_data()[0].shape[1]
n_events = dl.n_events

for i in range(n_events+1):
    times = dl.get_data()[1]
    events = (dl.get_data()[2] == i)*1.0
    n_censored = int(events.shape[0] - events.sum())
    max_t = times.max()
    print(f"Number of samples: {n_samples}")
    print(f"Number of features: {n_features}")
    print(f"Max t: {max_t}")
    print(f"Number of censored/uncensored: {n_censored}/{events.shape[0]-n_censored}")
    print(f"{round(n_censored / events.shape[0] * 100, 2)}% of records are censored")
    print(f"{round((events.shape[0]-n_censored) / events.shape[0] * 100, 2)}% of records are uncensored\n")

Number of samples: 19246
Number of features: 17
Max t: 121
Number of censored/uncensored: 11143/8103
57.9% of records are censored
42.1% of records are uncensored

Number of samples: 19246
Number of features: 17
Max t: 121
Number of censored/uncensored: 16857/2389
87.59% of records are censored
12.41% of records are uncensored

Number of samples: 19246
Number of features: 17
Max t: 121
Number of censored/uncensored: 10492/8754
54.52% of records are censored
45.48% of records are uncensored



In [67]:
dl = MimicSingleDataLoader().load_data()
n_samples = dl.get_data()[0].shape[0]
n_features = dl.get_data()[0].shape[1]
times = dl.get_data()[1]
events = dl.get_data()[2]
n_censored = int(events.shape[0] - events.sum())
max_t = times.max()

print(f"Number of samples: {n_samples}")
print(f"Number of features: {n_features}")
print(f"Max t: {max_t}")
print(f"Number of censored/uncensored: {n_censored}/{events.shape[0]-n_censored}")
print(f"{round(n_censored / events.shape[0] * 100, 2)}% of records are censored")
print(f"{round((events.shape[0]-n_censored) / events.shape[0] * 100, 2)}% of records are uncensored\n")

Number of samples: 24516
Number of features: 100
Max t: 4686
Number of censored/uncensored: 15390/9126
62.78% of records are censored
37.22% of records are uncensored



In [71]:
dl = MimicCompetingDataLoader().load_data()
n_samples = dl.get_data()[0].shape[0]
n_features = dl.get_data()[0].shape[1]
n_events = dl.n_events

for i in range(n_events):
    times = dl.get_data()[1]
    events = (dl.get_data()[2] == i+1)*1.0
    n_censored = int(events.shape[0] - events.sum())
    max_t = times.max()
    print(f"Number of samples: {n_samples}")
    print(f"Number of features: {n_features}")
    print(f"Max t: {max_t}")
    print(f"Number of censored/uncensored: {n_censored}/{events.shape[0]-n_censored}")
    print(f"{round(n_censored / events.shape[0] * 100, 2)}% of records are censored")
    print(f"{round((events.shape[0]-n_censored) / events.shape[0] * 100, 2)}% of records are uncensored\n")

Number of samples: 24516
Number of features: 100
Max t: 4686.0
Number of censored/uncensored: 23000/1516
93.82% of records are censored
6.18% of records are uncensored

Number of samples: 24516
Number of features: 100
Max t: 4686.0
Number of censored/uncensored: 23592/924
96.23% of records are censored
3.77% of records are uncensored

Number of samples: 24516
Number of features: 100
Max t: 4686.0
Number of censored/uncensored: 15390/9126
62.78% of records are censored
37.22% of records are uncensored



In [72]:
dl = MimicMultiDataLoader().load_data()
n_samples = dl.get_data()[0].shape[0]
n_features = dl.get_data()[0].shape[1]
n_events = dl.n_events

for i in range(n_events):
    times = dl.get_data()[1][:,i]
    events = dl.get_data()[2][:,i]
    n_censored = int(events.shape[0] - events.sum())
    max_t = times.max()
    print(f"Number of samples: {n_samples}")
    print(f"Number of features: {n_features}")
    print(f"Max t: {max_t}")
    print(f"Number of censored/uncensored: {n_censored}/{events.shape[0]-n_censored}")
    print(f"{round(n_censored / events.shape[0] * 100, 2)}% of records are censored")
    print(f"{round((events.shape[0]-n_censored) / events.shape[0] * 100, 2)}% of records are uncensored\n")

Number of samples: 24516
Number of features: 100
Max t: 4686.0
Number of censored/uncensored: 19441/5075
79.3% of records are censored
20.7% of records are uncensored

Number of samples: 24516
Number of features: 100
Max t: 4686.0
Number of censored/uncensored: 19811/4705
80.81% of records are censored
19.19% of records are uncensored

Number of samples: 24516
Number of features: 100
Max t: 4686.0
Number of censored/uncensored: 15390/9126
62.78% of records are censored
37.22% of records are uncensored



In [73]:
dl = RotterdamCompetingDataLoader().load_data()
n_samples = dl.get_data()[0].shape[0]
n_features = dl.get_data()[0].shape[1]
n_events = dl.n_events

for i in range(n_events):
    times = dl.get_data()[1]
    events = (dl.get_data()[2] == i+1)*1.0
    n_censored = int(events.shape[0] - events.sum())
    max_t = times.max()
    print(f"Number of samples: {n_samples}")
    print(f"Number of features: {n_features}")
    print(f"Max t: {max_t}")
    print(f"Number of censored/uncensored: {n_censored}/{events.shape[0]-n_censored}")
    print(f"{round(n_censored / events.shape[0] * 100, 2)}% of records are censored")
    print(f"{round((events.shape[0]-n_censored) / events.shape[0] * 100, 2)}% of records are uncensored\n")

Number of samples: 2982
Number of features: 10
Max t: 7043
Number of censored/uncensored: 2787/195
93.46% of records are censored
6.54% of records are uncensored

Number of samples: 2982
Number of features: 10
Max t: 7043
Number of censored/uncensored: 1464/1518
49.09% of records are censored
50.91% of records are uncensored



In [2]:
dl = RotterdamMultiDataLoader().load_data()
n_samples = dl.get_data()[0].shape[0]
n_features = dl.get_data()[0].shape[1]
n_events = dl.n_events

for i in range(n_events):
    times = dl.get_data()[1][:,i]
    events = dl.get_data()[2][:,i]
    n_censored = int(events.shape[0] - events.sum())
    max_t = times.max()
    print(f"Number of samples: {n_samples}")
    print(f"Number of features: {n_features}")
    print(f"Max t: {max_t}")
    print(f"Number of censored/uncensored: {n_censored}/{events.shape[0]-n_censored}")
    print(f"{round(n_censored / events.shape[0] * 100, 2)}% of records are censored")
    print(f"{round((events.shape[0]-n_censored) / events.shape[0] * 100, 2)}% of records are uncensored\n")

Number of samples: 2982
Number of features: 10
Max t: 7043
Number of censored/uncensored: 1464/1518
49.09% of records are censored
50.91% of records are uncensored

Number of samples: 2982
Number of features: 10
Max t: 7043
Number of censored/uncensored: 1710/1272
57.34% of records are censored
42.66% of records are uncensored



In [3]:
dl = PROACTMultiDataLoader().load_data()
n_samples = dl.get_data()[0].shape[0]
n_features = dl.get_data()[0].shape[1]
n_events = dl.n_events

for i in range(n_events):
    times = dl.get_data()[1][:,i]
    events = dl.get_data()[2][:,i]
    n_censored = int(events.shape[0] - events.sum())
    max_t = times.max()
    print(f"Number of samples: {n_samples}")
    print(f"Number of features: {n_features}")
    print(f"Max t: {max_t}")
    print(f"Number of censored/uncensored: {n_censored}/{events.shape[0]-n_censored}")
    print(f"{round(n_censored / events.shape[0] * 100, 2)}% of records are censored")
    print(f"{round((events.shape[0]-n_censored) / events.shape[0] * 100, 2)}% of records are uncensored\n")

Number of samples: 1807
Number of features: 9
Max t: 365.0
Number of censored/uncensored: 1176/631
65.08% of records are censored
34.92% of records are uncensored

Number of samples: 1807
Number of features: 9
Max t: 365.0
Number of censored/uncensored: 1286/521
71.17% of records are censored
28.83% of records are uncensored

Number of samples: 1807
Number of features: 9
Max t: 365.0
Number of censored/uncensored: 974/833
53.9% of records are censored
46.1% of records are uncensored

Number of samples: 1807
Number of features: 9
Max t: 365.0
Number of censored/uncensored: 794/1013
43.94% of records are censored
56.06% of records are uncensored



In [2]:
dl = EBMTDataLoader().load_data()
n_samples = dl.get_data()[0].shape[0]
n_features = dl.get_data()[0].shape[1]
n_events = dl.n_events

for i in range(n_events):
    times = dl.get_data()[1][:,i]
    events = dl.get_data()[2][:,i]
    n_censored = int(events.shape[0] - events.sum())
    max_t = times.max()
    print(f"Number of samples: {n_samples}")
    print(f"Number of features: {n_features}")
    print(f"Max t: {max_t}")
    print(f"Number of censored/uncensored: {n_censored}/{events.shape[0]-n_censored}")
    print(f"{round(n_censored / events.shape[0] * 100, 2)}% of records are censored")
    print(f"{round((events.shape[0]-n_censored) / events.shape[0] * 100, 2)}% of records are uncensored\n")

Number of samples: 2279
Number of features: 6
Max t: 6228.0
Number of censored/uncensored: 1494/785
65.56% of records are censored
34.44% of records are uncensored

Number of samples: 2279
Number of features: 6
Max t: 6299.0
Number of censored/uncensored: 1372/907
60.2% of records are censored
39.8% of records are uncensored

Number of samples: 2279
Number of features: 6
Max t: 6299.0
Number of censored/uncensored: 1619/660
71.04% of records are censored
28.96% of records are uncensored

Number of samples: 2279
Number of features: 6
Max t: 6299.0
Number of censored/uncensored: 1909/370
83.76% of records are censored
16.24% of records are uncensored

Number of samples: 2279
Number of features: 6
Max t: 6299.0
Number of censored/uncensored: 1746/533
76.61% of records are censored
23.39% of records are uncensored

