In [1]:
import pickle
from tableone import TableOne # for the descriptive table
import os
import pandas as pd

In [2]:
# Read already prepared and saved data
with open('data/data_clean.pkl', 'rb') as f:
    data = pickle.load(f)

# Exploratory data analysis

## Original data

In [3]:
# Explore the original data
print('Original data shape: ', data.shape)
print('\nSample data:\n', data.head(), '\n')

Original data shape:  (1564, 22)

Sample data:
     age    psa clinical_stage  biopsy_gleason biopsy_gleason_gg  \
0  72.0  12.70              2             7.0                 2   
1  66.0   7.59              3             9.0                 5   
2  65.0   7.60              3             7.0                 2   
3  70.0  14.18              3             7.0                 3   
4  57.0   7.86              2             7.0                 3   

  pathological_gleason_gg pathologic_stage  lni surgical_margin_status  \
0                       2                2  1.0                      1   
1                       5                1  1.0                      1   
2                       2                2  1.0                      0   
3                       3                2  1.0                      1   
4                       4                1  1.0                      0   

  persistent_psa  ... survival_months overall_mortality  \
0              0  ...             138        

In [4]:
print('\nDescribing data:\n', data.describe(), '\n')


Describing data:
                age          psa  biopsy_gleason  survival_months  \
count  1564.000000  1564.000000     1564.000000      1564.000000   
mean     63.150256     8.542887        6.514706       108.690537   
std       6.503490     7.252175        0.736865        54.528575   
min      40.000000     0.440000        3.000000         1.000000   
25%      59.000000     4.800000        6.000000        65.000000   
50%      64.000000     6.440000        6.000000       104.000000   
75%      68.000000     9.692500        7.000000       159.000000   
max      87.000000    98.400000       10.000000       225.000000   

       pathologic_gleason  survival_months_bcr  survival_months_mts  \
count         1564.000000          1564.000000          1564.000000   
mean             6.898977            57.438619            74.636189   
std              0.777992            52.240432            53.828630   
min              4.000000             1.000000             1.000000   
25%          

In [20]:
# Descriptive statistics
min_max = ['age']
nonnormal = ['age', 'psa', 'BxGleason', 'Rpgleson', 'survival_months', 
             'survival_months_bcr', 'survival_months_mts']
categorical = ['clinical_stage', 'lni', 'biopsy_gleason_gg', 'pathological_gleason_gg',
              'pathologic_stage', 'surgical_margin_status', 'persistent_psa', 'TRYSgrupes',
              'PLNDO1']

In [12]:
if not os.path.exists('eda'):
   os.makedirs('eda')

In [23]:
# Overall
columns = data.columns.tolist()
columns_to_remove = ['patient_id']
columns = [column for column in columns if column not in columns_to_remove]
overall_categorical = categorical.copy()
overall_categorical.extend(['bcr', 'mts', 'cancer_specific_mortality', 'death_from_other_causes', 'overall_mortality'])

mytable = TableOne(data, columns=columns, nonnormal=nonnormal, 
                   min_max=min_max, missing=False, rename={'age': 'Age (years)'}, 
                   sort=False, categorical=overall_categorical)
print(mytable.tabulate(tablefmt="github"), '\n')
mytable.to_excel('eda/'+'overall.xlsx')

|                                     |         | Overall            |
|-------------------------------------|---------|--------------------|
| n                                   |         | 1564               |
| Age (years), median [min,max]       |         | 64.0 [40.0,87.0]   |
| psa, median [Q1,Q3]                 |         | 6.4 [4.8,9.7]      |
| clinical_stage, n (%)               | 1       | 434 (27.7)         |
|                                     | 2       | 911 (58.2)         |
|                                     | 3       | 219 (14.0)         |
| biopsy_gleason, mean (SD)           |         | 6.5 (0.7)          |
| biopsy_gleason_gg, n (%)            | 1       | 894 (57.2)         |
|                                     | 2       | 479 (30.6)         |
|                                     | 3       | 75 (4.8)           |
|                                     | 4       | 79 (5.1)           |
|                                     | 5       | 37 (2.4)           |
| path

In [18]:
# Overall mortality
columns = data.columns.tolist()
columns_to_remove = ['patient_id', 'bcr', 'mts', 'cancer_specific_mortality', 'death_from_other_causes']
columns = [column for column in columns if column not in columns_to_remove]
groupby = 'overall_mortality'

mytable = TableOne(data, columns=columns, groupby=groupby, nonnormal=nonnormal, 
                   min_max=min_max, missing=False, rename={'age': 'Age (years)'}, 
                   sort=False, categorical=categorical)
print(mytable.tabulate(tablefmt="github"), '\n')
mytable.to_excel('eda/'+groupby+'.xlsx')


|                                     |         | Overall            | 0                  | 1                 |
|-------------------------------------|---------|--------------------|--------------------|-------------------|
| n                                   |         | 1564               | 1300               | 264               |
| Age (years), median [min,max]       |         | 64.0 [40.0,87.0]   | 63.0 [40.0,87.0]   | 68.0 [53.0,78.0]  |
| psa, median [Q1,Q3]                 |         | 6.4 [4.8,9.7]      | 6.3 [4.7,9.3]      | 7.2 [5.2,10.5]    |
| clinical_stage, n (%)               | 1       | 434 (27.7)         | 376 (28.9)         | 58 (22.0)         |
|                                     | 2       | 911 (58.2)         | 754 (58.0)         | 157 (59.5)        |
|                                     | 3       | 219 (14.0)         | 170 (13.1)         | 49 (18.6)         |
| biopsy_gleason, mean (SD)           |         | 6.5 (0.7)          | 6.5 (0.7)          | 6.5 (0.9)   

In [10]:
# Cancer Specific Mortality
columns = data.columns.tolist()
columns_to_remove = ['patient_id', 'bcr', 'mts', 'overall_mortality', 'death_from_other_causes']
columns = [column for column in columns if column not in columns_to_remove]
groupby = 'cancer_specific_mortality'

mytable = TableOne(data, columns=columns, groupby=groupby, nonnormal=nonnormal, 
                   min_max=min_max, missing=False, rename={'age': 'Age (years)'}, 
                   sort=False, categorical=categorical)
print(mytable.tabulate(tablefmt="github"), '\n')
mytable.to_excel('eda/'+groupby+'.xlsx')


|                                     |         | Overall            | 0                  | 1                 |
|-------------------------------------|---------|--------------------|--------------------|-------------------|
| n                                   |         | 1564               | 1514               | 50                |
| Age (years), median [min,max]       |         | 64.0 [40.0,87.0]   | 64.0 [40.0,87.0]   | 68.0 [54.0,73.0]  |
| psa, median [Q1,Q3]                 |         | 6.4 [4.8,9.7]      | 6.4 [4.8,9.7]      | 7.9 [5.3,11.2]    |
| clinical_stage, n (%)               | 1       | 434 (27.7)         | 430 (28.4)         | 4 (8.0)           |
|                                     | 2       | 911 (58.2)         | 882 (58.3)         | 29 (58.0)         |
|                                     | 3       | 219 (14.0)         | 202 (13.3)         | 17 (34.0)         |
| biopsy_gleason, mean (SD)           |         | 6.5 (0.7)          | 6.5 (0.7)          | 7.1 (1.2)   

In [11]:
# Death from other causes
columns = data.columns.tolist()
columns_to_remove = ['patient_id', 'bcr', 'mts', 'cancer_specific_mortality', 'overall_mortality']
columns = [column for column in columns if column not in columns_to_remove]
groupby = 'death_from_other_causes'

mytable = TableOne(data, columns=columns, groupby=groupby, nonnormal=nonnormal, 
                   min_max=min_max, missing=False, rename={'age': 'Age (years)'}, 
                   sort=False, categorical=categorical)
print(mytable.tabulate(tablefmt="github"), '\n')
mytable.to_excel('eda/'+groupby+'.xlsx')


|                                     |         | Overall            | 0                  | 1                 |
|-------------------------------------|---------|--------------------|--------------------|-------------------|
| n                                   |         | 1564               | 1350               | 214               |
| Age (years), median [min,max]       |         | 64.0 [40.0,87.0]   | 63.0 [40.0,87.0]   | 68.0 [53.0,78.0]  |
| psa, median [Q1,Q3]                 |         | 6.4 [4.8,9.7]      | 6.3 [4.8,9.5]      | 7.2 [5.2,10.5]    |
| clinical_stage, n (%)               | 1       | 434 (27.7)         | 380 (28.1)         | 54 (25.2)         |
|                                     | 2       | 911 (58.2)         | 783 (58.0)         | 128 (59.8)        |
|                                     | 3       | 219 (14.0)         | 187 (13.9)         | 32 (15.0)         |
| biopsy_gleason, mean (SD)           |         | 6.5 (0.7)          | 6.5 (0.7)          | 6.3 (0.8)   

In [12]:
# BCR
columns = data.columns.tolist()
columns_to_remove = ['patient_id', 'overall_mortality', 'mts', 'cancer_specific_mortality', 'death_from_other_causes']
columns = [column for column in columns if column not in columns_to_remove]
groupby = 'bcr'

mytable = TableOne(data, columns=columns, groupby=groupby, nonnormal=nonnormal, 
                   min_max=min_max, missing=False, rename={'age': 'Age (years)'}, 
                   sort=False, categorical=categorical)
print(mytable.tabulate(tablefmt="github"), '\n')
mytable.to_excel('eda/'+groupby+'.xlsx')


|                                     |         | Overall            | 0                  | 1                  |
|-------------------------------------|---------|--------------------|--------------------|--------------------|
| n                                   |         | 1564               | 1104               | 460                |
| Age (years), median [min,max]       |         | 64.0 [40.0,87.0]   | 64.0 [40.0,87.0]   | 64.0 [44.0,78.0]   |
| psa, median [Q1,Q3]                 |         | 6.4 [4.8,9.7]      | 6.0 [4.6,8.5]      | 8.6 [5.5,13.1]     |
| clinical_stage, n (%)               | 1       | 434 (27.7)         | 345 (31.2)         | 89 (19.3)          |
|                                     | 2       | 911 (58.2)         | 656 (59.4)         | 255 (55.4)         |
|                                     | 3       | 219 (14.0)         | 103 (9.3)          | 116 (25.2)         |
| biopsy_gleason, mean (SD)           |         | 6.5 (0.7)          | 6.4 (0.6)          | 6.8 

In [13]:
# MTS
columns = data.columns.tolist()
columns_to_remove = ['patient_id', 'bcr', 'overall_mortality', 'cancer_specific_mortality', 'death_from_other_causes']
columns = [column for column in columns if column not in columns_to_remove]
groupby = 'mts'

mytable = TableOne(data, columns=columns, groupby=groupby, nonnormal=nonnormal, 
                   min_max=min_max, missing=False, rename={'age': 'Age (years)'}, 
                   sort=False, categorical=categorical)
print(mytable.tabulate(tablefmt="github"), '\n')
mytable.to_excel('eda/'+groupby+'.xlsx')

|                                     |         | Overall            | 0                  | 1                 |
|-------------------------------------|---------|--------------------|--------------------|-------------------|
| n                                   |         | 1564               | 1465               | 99                |
| Age (years), median [min,max]       |         | 64.0 [40.0,87.0]   | 64.0 [40.0,87.0]   | 65.0 [48.0,73.0]  |
| psa, median [Q1,Q3]                 |         | 6.4 [4.8,9.7]      | 6.3 [4.8,9.4]      | 8.8 [6.5,14.3]    |
| clinical_stage, n (%)               | 1       | 434 (27.7)         | 426 (29.1)         | 8 (8.1)           |
|                                     | 2       | 911 (58.2)         | 852 (58.2)         | 59 (59.6)         |
|                                     | 3       | 219 (14.0)         | 187 (12.8)         | 32 (32.3)         |
| biopsy_gleason, mean (SD)           |         | 6.5 (0.7)          | 6.5 (0.7)          | 7.1 (1.1)   

## Train - test dataset overview

In [3]:
trainFilePath = 'data/data_train.pkl'
testFilePath = 'data/data_test.pkl'
trainHomogenousFilePath = 'data/data_train_homogenous.pkl'

print('--------------------------------------')
print(f'Reading train data from: {trainFilePath} ........')
with open(trainFilePath, 'rb') as f:
    data_train = pickle.load(f)
print(f'Finished reading, loaded train dataset shape: {data_train.shape}')
print(f'Read train column names: {data_train.columns}\n')

print(f'Reading test data from: {testFilePath} ........')
with open(testFilePath, 'rb') as f:
    data_test = pickle.load(f)
print(f'Finished reading, loaded test dataset shape: {data_test.shape}')
print(f'Read test column names: {data_test.columns}\n')

print(f'Reading train homogenous data from: {trainHomogenousFilePath} ........')
with open(trainHomogenousFilePath, 'rb') as f:
    data_train_homogenous = pickle.load(f)
print(f'Finished reading, loaded train homogenous dataset shape: {data_train_homogenous.shape}')
print(f'Read train homogenous column names: {data_train_homogenous.columns}\n')

--------------------------------------
Reading train data from: data/data_train.pkl ........
Finished reading, loaded train dataset shape: (1251, 21)
Read train column names: Index(['age', 'psa', 'clinical_stage', 'biopsy_gleason', 'biopsy_gleason_gg',
       'pathological_gleason_gg', 'pathologic_stage', 'lni',
       'surgical_margin_status', 'persistent_psa', 'survival_months',
       'pathologic_gleason', 'TRYSgrupes', 'PLNDO1', 'survival_months_bcr',
       'survival_months_mts', 'patient_id', 'bcr', 'mts',
       'death_from_other_causes', 'cancer_specific_mortality'],
      dtype='object')

Reading test data from: data/data_test.pkl ........
Finished reading, loaded test dataset shape: (313, 21)
Read test column names: Index(['age', 'psa', 'clinical_stage', 'biopsy_gleason', 'biopsy_gleason_gg',
       'pathological_gleason_gg', 'pathologic_stage', 'lni',
       'surgical_margin_status', 'persistent_psa', 'survival_months',
       'pathologic_gleason', 'TRYSgrupes', 'PLNDO1', 's

Based on untransformed split

In [15]:
# -------------------------
# Train (original) - Test
# -------------------------
categorical = ['clinical_stage', 'lni', 'biopsy_gleason_gg', 'pathological_gleason_gg',
              'pathologic_stage', 'surgical_margin_status', 'persistent_psa', 'TRYSgrupes',
              'PLNDO1', 'bcr', 'mts', 'death_from_other_causes', 'cancer_specific_mortality']

temp_train = data_train.copy()
temp_train['split'] = '1_train'

temp_test = data_test.copy()
temp_test['split'] = '2_test'

temp = pd.concat([temp_train, temp_test])
columns = temp.columns.tolist()
columns_to_remove = ['patient_id']
columns = [column for column in columns if column not in columns_to_remove]
groupby = 'split'

mytable = TableOne(temp, columns=columns, groupby=groupby, nonnormal=nonnormal, 
                   min_max=min_max, missing=False, rename={'age': 'Age (years)'}, 
                   sort=False, categorical=categorical)
print(mytable.tabulate(tablefmt="github"), '\n')
mytable.to_excel('eda/train-test-split-original.xlsx')
# -------------------------

|                                     |         | Overall            | 1_train            | 2_test             |
|-------------------------------------|---------|--------------------|--------------------|--------------------|
| n                                   |         | 1564               | 1251               | 313                |
| Age (years), median [min,max]       |         | 64.0 [40.0,87.0]   | 64.0 [40.0,78.0]   | 63.0 [45.0,87.0]   |
| psa, median [Q1,Q3]                 |         | 6.4 [4.8,9.7]      | 6.5 [4.8,9.7]      | 6.2 [4.8,9.3]      |
| clinical_stage, n (%)               | 1       | 434 (27.7)         | 347 (27.7)         | 87 (27.8)          |
|                                     | 2       | 911 (58.2)         | 729 (58.3)         | 182 (58.1)         |
|                                     | 3       | 219 (14.0)         | 175 (14.0)         | 44 (14.1)          |
| biopsy_gleason, mean (SD)           |         | 6.5 (0.7)          | 6.5 (0.7)          | 6.5 

Based on homogenous training set

In [16]:
# -------------------------
# Train (homogenous) - Test
# -------------------------
categorical = ['clinical_stage', 'lni', 'biopsy_gleason_gg', 'pathological_gleason_gg',
              'pathologic_stage', 'surgical_margin_status', 'persistent_psa', 'TRYSgrupes',
              'PLNDO1', 'bcr', 'mts', 'death_from_other_causes', 'cancer_specific_mortality']

temp_train = data_train_homogenous.copy()
temp_train['split'] = '1_train_homogenous'

temp_test = data_test.copy()
temp_test['split'] = '2_test'

temp = pd.concat([temp_train, temp_test])
columns = temp.columns.tolist()
columns_to_remove = ['patient_id']
columns = [column for column in columns if column not in columns_to_remove]
groupby = 'split'

mytable = TableOne(temp, columns=columns, groupby=groupby, nonnormal=nonnormal, 
                   min_max=min_max, missing=False, rename={'age': 'Age (years)'}, 
                   sort=False, categorical=categorical)
print(mytable.tabulate(tablefmt="github"), '\n')
mytable.to_excel('eda/train-test-split-homogenous.xlsx')
# -------------------------

|                                     |         | Overall           | 1_train_homogenous   | 2_test             |
|-------------------------------------|---------|-------------------|----------------------|--------------------|
| n                                   |         | 401               | 88                   | 313                |
| Age (years), median [min,max]       |         | 63.0 [45.0,87.0]  | 65.0 [51.0,76.0]     | 63.0 [45.0,87.0]   |
| psa, median [Q1,Q3]                 |         | 6.8 [5.0,11.2]    | 10.9 [6.5,19.0]      | 6.2 [4.8,9.3]      |
| clinical_stage, n (%)               | 1       | 93 (23.2)         | 6 (6.8)              | 87 (27.8)          |
|                                     | 2       | 227 (56.6)        | 45 (51.1)            | 182 (58.1)         |
|                                     | 3       | 81 (20.2)         | 37 (42.0)            | 44 (14.1)          |
| biopsy_gleason, mean (SD)           |         | 6.7 (0.9)         | 7.2 (1.0)         

## Survival times

In [25]:
# Check min and max survival times
print(f'Max training survival months - {data_train["survival_months"].max()}')
print(f'Min training survival months - {data_train["survival_months"].min()}')

print(f'\nMax homogenous training survival months - {data_train_homogenous["survival_months"].max()}')
print(f'Min homogenous training survival months - {data_train_homogenous["survival_months"].min()}')

print(f'\nMax testing survival months - {data_test["survival_months"].max()}')
print(f'Min testing survival months - {data_test["survival_months"].min()}')


print(f'\nBCR:\n')
print(f'Max training survival months - {data_train["survival_months_bcr"].max()}')
print(f'Min training survival months - {data_train["survival_months_bcr"].min()}')

print(f'\nMax homogenous training survival months - {data_train_homogenous["survival_months_bcr"].max()}')
print(f'Min homogenous training survival months - {data_train_homogenous["survival_months_bcr"].min()}')

print(f'\nMax testing survival months - {data_test["survival_months_bcr"].max()}')
print(f'Min testing survival months - {data_test["survival_months_bcr"].min()}')


print(f'\nMTS:\n')
print(f'Max training survival months - {data_train["survival_months_mts"].max()}')
print(f'Min training survival months - {data_train["survival_months_mts"].min()}')

print(f'\nMax homogenous training survival months - {data_train_homogenous["survival_months_mts"].max()}')
print(f'Min homogenous training survival months - {data_train_homogenous["survival_months_mts"].min()}')

print(f'\nMax testing survival months - {data_test["survival_months_mts"].max()}')
print(f'Min testing survival months - {data_test["survival_months_mts"].min()}')

Max training survival months - 225
Min training survival months - 1

Max homogenous training survival months - 218
Min homogenous training survival months - 1

Max testing survival months - 224
Min testing survival months - 5

BCR:

Max training survival months - 214
Min training survival months - 1

Max homogenous training survival months - 164
Min homogenous training survival months - 1

Max testing survival months - 219
Min testing survival months - 1

MTS:

Max training survival months - 214
Min training survival months - 1

Max homogenous training survival months - 212
Min homogenous training survival months - 1

Max testing survival months - 219
Min testing survival months - 2


## patients who experienced death

In [26]:
print('Patient ids who experienced cancer specific mortality in training set:')
print(data_train.loc[data_train['cancer_specific_mortality'] == '1'].patient_id.values)

print('\nPatient ids who experienced cancer specific mortality in homogenous training set:')
print(data_train_homogenous.loc[data_train_homogenous['cancer_specific_mortality'] == '1'].patient_id.values)

print('\nPatient ids who experienced cancer specific mortality in testing set:')
print(data_test.loc[data_test['cancer_specific_mortality'] == '1'].patient_id.values)

Patient ids who experienced cancer specific mortality in training set:
[ 223  393   19 1051   22  624  247  844  615  111    2  855  671   30
  243  312  668  193  516  578   17  703  176  594  182  896   20 1412
   31  174 1261   28  520  155  179   34 1256   16  173   35  177  623]

Patient ids who experienced cancer specific mortality in homogenous training set:
[223  19  22 615 111   2 671  30 668  17  31  28 179  16  35]

Patient ids who experienced cancer specific mortality in testing set:
[ 195   23  142  641 1003  194   41   25]
