In [1]:
import h5py
import pandas as pd
import numpy as np

In [None]:
file_path = "raw/metabric.h5"


def _read_split(h5, split):
    """Read the `x`, `t` and `e` datasets stored under `split` fully into memory.

    Returns them as a 3-tuple in that order.
    """
    return tuple(h5[f"{split}/{key}"][:] for key in ("x", "t", "e"))


# Pull every array out while the file is open; the handle is closed on exit.
with h5py.File(file_path, "r") as h5:
    x_train, t_train, e_train = _read_split(h5, "train")
    x_test, t_test, e_test = _read_split(h5, "test")

# One frame per split: the `x` array supplies the integer-labelled columns,
# and `t` / `e` are appended as the `time` and `event` columns.
df_train = pd.DataFrame(x_train).assign(time=t_train, event=e_train)
df_test = pd.DataFrame(x_test).assign(time=t_test, event=e_test)

In [3]:
# Order each split by ascending `time` and renumber its rows from 0.
df_train, df_test = (
    split.sort_values(by="time").reset_index(drop=True)
    for split in (df_train, df_test)
)

df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,time,event
0,5.818934,6.470783,10.672935,5.630679,0.0,0.0,0.0,1.0,75.330002,0.1,1
1,5.705204,8.450347,10.859011,5.667925,0.0,0.0,0.0,1.0,73.639999,1.233333,0
2,5.18406,8.427523,10.361415,5.575082,1.0,0.0,0.0,1.0,73.980003,1.266667,0
3,5.621474,5.456216,9.500981,5.753597,1.0,0.0,0.0,1.0,34.68,1.433333,0
4,6.621584,5.179721,13.374391,5.692907,0.0,0.0,0.0,0.0,53.16,1.766667,0


In [4]:
# Discard the columns labelled 4 through 7 from both splits, which leaves
# columns 0-3, 8, `time` and `event`.
# NOTE(review): in the preview rows these columns only hold 0.0 / 1.0 —
# confirm why they are excluded and record the reason here.
df_train, df_test = (
    split.drop(list(range(4, 8)), axis="columns")
    for split in (df_train, df_test)
)

df_train.head()

Unnamed: 0,0,1,2,3,8,time,event
0,5.818934,6.470783,10.672935,5.630679,75.330002,0.1,1
1,5.705204,8.450347,10.859011,5.667925,73.639999,1.233333,0
2,5.18406,8.427523,10.361415,5.575082,73.980003,1.266667,0
3,5.621474,5.456216,9.500981,5.753597,34.68,1.433333,0
4,6.621584,5.179721,13.374391,5.692907,53.16,1.766667,0


In [5]:
# Map the remaining positional column labels to descriptive names;
# `time` and `event` are not in the mapping and keep their labels.
new_column_names = dict(
    zip(
        (0, 1, 2, 3, 8),
        ("MKI67", "EGFR", "PGR", "ERBB2", "age at diagnosis"),
    )
)

# Apply the same mapping to both splits so their columns stay aligned.
df_train, df_test = (
    split.rename(columns=new_column_names) for split in (df_train, df_test)
)
df_train.head()

Unnamed: 0,MKI67,EGFR,PGR,ERBB2,age at diagnosis,time,event
0,5.818934,6.470783,10.672935,5.630679,75.330002,0.1,1
1,5.705204,8.450347,10.859011,5.667925,73.639999,1.233333,0
2,5.18406,8.427523,10.361415,5.575082,73.980003,1.266667,0
3,5.621474,5.456216,9.500981,5.753597,34.68,1.433333,0
4,6.621584,5.179721,13.374391,5.692907,53.16,1.766667,0


In [6]:
# Stack the two splits into a single frame, order the pooled rows by `time`,
# renumber them from 0, and relabel the `event` column as `failure`.
df_all = (
    pd.concat([df_train, df_test], axis=0, ignore_index=True)
    .sort_values(by="time")
    .reset_index(drop=True)
    .rename(columns={"event": "failure"})
)
df_all.head()

Unnamed: 0,MKI67,EGFR,PGR,ERBB2,age at diagnosis,time,failure
0,7.220886,5.526921,9.640237,6.258209,51.419998,0.0,0
1,5.818934,6.470783,10.672935,5.630679,75.330002,0.1,1
2,10.117913,5.335094,9.717084,5.893656,54.099998,0.766667,0
3,5.705204,8.450347,10.859011,5.667925,73.639999,1.233333,0
4,5.18406,8.427523,10.361415,5.575082,73.980003,1.266667,0


In [7]:
# Convert the 0/1 `failure` indicator to a boolean column; all other
# columns keep their dtypes.
df_all = df_all.astype({"failure": bool})
df_all.head()

Unnamed: 0,MKI67,EGFR,PGR,ERBB2,age at diagnosis,time,failure
0,7.220886,5.526921,9.640237,6.258209,51.419998,0.0,False
1,5.818934,6.470783,10.672935,5.630679,75.330002,0.1,True
2,10.117913,5.335094,9.717084,5.893656,54.099998,0.766667,False
3,5.705204,8.450347,10.859011,5.667925,73.639999,1.233333,False
4,5.18406,8.427523,10.361415,5.575082,73.980003,1.266667,False


In [8]:
# Persist the combined frame as a pickle; the path is resolved relative to
# the notebook's working directory.
processed_path = "../../data/processed/metabric_processed.pkl"
df_all.to_pickle(processed_path)