In [8]:
from pickle import load
from tableone import TableOne # for the descriptive table
import os

In [2]:
# Load the previously prepared dataset from its pickle file.
# NOTE: unpickling can run arbitrary code, so only load files you trust.
with open('data/data_clean.pkl', mode='rb') as pickle_file:
    data = load(pickle_file)

In [3]:
# Quick look at the loaded data: its dimensions first, then the leading rows.
data_shape = data.shape
first_rows = data.head()
print('Original data shape: ', data_shape)
print('\nSample data:\n', first_rows, '\n')

Original data shape:  (1564, 22)

Sample data:
     age    psa clinical_stage  BxGleason biopsy_gleason_gg  \
0  72.0  12.70              2        7.0                 2   
1  66.0   7.59              3        9.0                 5   
2  65.0   7.60              3        7.0                 2   
3  70.0  14.18              3        7.0                 3   
4  57.0   7.86              2        7.0                 3   

  pathological_gleason_gg pathologic_stage  lni surgical_margin_status  \
0                       2                2  1.0                      1   
1                       5                1  1.0                      1   
2                       2                2  1.0                      0   
3                       3                2  1.0                      1   
4                       4                1  1.0                      0   

  persistent_psa  ... survival_months overall_mortality  \
0              0  ...             138                 0   
1              0

In [4]:
# Summary statistics as returned by data.describe().
summary_stats = data.describe()
print('\nDescribing data:\n', summary_stats, '\n')


Describing data:
                age          psa    BxGleason  survival_months     Rpgleson  \
count  1564.000000  1564.000000  1564.000000      1564.000000  1564.000000   
mean     63.150256     8.542887     6.514706       108.690537     6.898977   
std       6.503490     7.252175     0.736865        54.528575     0.777992   
min      40.000000     0.440000     3.000000         1.000000     4.000000   
25%      59.000000     4.800000     6.000000        65.000000     6.000000   
50%      64.000000     6.440000     6.000000       104.000000     7.000000   
75%      68.000000     9.692500     7.000000       159.000000     7.000000   
max      87.000000    98.400000    10.000000       225.000000    10.000000   

       survival_months_bcr  survival_months_mts   patient_id  
count          1564.000000          1564.000000  1564.000000  
mean             57.438619            74.636189   782.500000  
std              52.240432            53.828630   451.632225  
min               1.000000

In [5]:
# Variable roles handed to every TableOne call further down.

# Shown with [min,max] instead of [Q1,Q3] in the rendered tables.
min_max = ['age']

# Summarised by their median in the rendered tables.
nonnormal = [
    'age',
    'psa',
    'BxGleason',
    'Rpgleson',
    'survival_months',
    'survival_months_bcr',
    'survival_months_mts',
]

# Summarised as counts with percentages, n (%), in the rendered tables.
categorical = [
    'clinical_stage',
    'lni',
    'biopsy_gleason_gg',
    'pathological_gleason_gg',
    'pathologic_stage',
    'surgical_margin_status',
    'persistent_psa',
    'TRYSgrupes',
    'PLNDO1',
]

In [9]:
# Make sure the output directory for the exported tables exists.
# exist_ok=True keeps the cell safe to re-run and removes the race between
# checking for the directory and creating it.
os.makedirs('eda', exist_ok=True)

In [10]:
# Descriptive table stratified by overall mortality.
# The patient identifier and the grouping variables used by the other tables
# in this notebook are left out of the rows.
groupby = 'overall_mortality'
columns_to_remove = [
    'patient_id',
    'bcr',
    'mts',
    'cancer_specific_mortality',
    'death_from_other_causes',
]
columns = [name for name in data.columns.tolist() if name not in columns_to_remove]

mytable = TableOne(
    data,
    columns=columns,
    groupby=groupby,
    categorical=categorical,
    nonnormal=nonnormal,
    min_max=min_max,
    missing=False,
    rename={'age': 'Age (years)'},
    sort=False,
)

# Show the table as GitHub-flavoured markdown and save a copy under eda/.
print(mytable.tabulate(tablefmt="github"), '\n')
mytable.to_excel('eda/' + groupby + '.xlsx')


|                                     |         | Overall            | 0                  | 1                 |
|-------------------------------------|---------|--------------------|--------------------|-------------------|
| n                                   |         | 1564               | 1300               | 264               |
| Age (years), median [min,max]       |         | 64.0 [40.0,87.0]   | 63.0 [40.0,87.0]   | 68.0 [53.0,78.0]  |
| psa, median [Q1,Q3]                 |         | 6.4 [4.8,9.7]      | 6.3 [4.7,9.3]      | 7.2 [5.2,10.5]    |
| clinical_stage, n (%)               | 1       | 434 (27.7)         | 376 (28.9)         | 58 (22.0)         |
|                                     | 2       | 911 (58.2)         | 754 (58.0)         | 157 (59.5)        |
|                                     | 3       | 219 (14.0)         | 170 (13.1)         | 49 (18.6)         |
| BxGleason, median [Q1,Q3]           |         | 6.0 [6.0,7.0]      | 6.0 [6.0,7.0]      | 6.0 [6.0,7.0

In [11]:
# Descriptive table stratified by cancer-specific mortality.
# The patient identifier and the grouping variables used by the other tables
# in this notebook are left out of the rows.
groupby = 'cancer_specific_mortality'
columns_to_remove = [
    'patient_id',
    'bcr',
    'mts',
    'overall_mortality',
    'death_from_other_causes',
]
columns = [name for name in data.columns.tolist() if name not in columns_to_remove]

mytable = TableOne(
    data,
    columns=columns,
    groupby=groupby,
    categorical=categorical,
    nonnormal=nonnormal,
    min_max=min_max,
    missing=False,
    rename={'age': 'Age (years)'},
    sort=False,
)

# Show the table as GitHub-flavoured markdown and save a copy under eda/.
print(mytable.tabulate(tablefmt="github"), '\n')
mytable.to_excel('eda/' + groupby + '.xlsx')


|                                     |         | Overall            | 0                  | 1                 |
|-------------------------------------|---------|--------------------|--------------------|-------------------|
| n                                   |         | 1564               | 1514               | 50                |
| Age (years), median [min,max]       |         | 64.0 [40.0,87.0]   | 64.0 [40.0,87.0]   | 68.0 [54.0,73.0]  |
| psa, median [Q1,Q3]                 |         | 6.4 [4.8,9.7]      | 6.4 [4.8,9.7]      | 7.9 [5.3,11.2]    |
| clinical_stage, n (%)               | 1       | 434 (27.7)         | 430 (28.4)         | 4 (8.0)           |
|                                     | 2       | 911 (58.2)         | 882 (58.3)         | 29 (58.0)         |
|                                     | 3       | 219 (14.0)         | 202 (13.3)         | 17 (34.0)         |
| BxGleason, median [Q1,Q3]           |         | 6.0 [6.0,7.0]      | 6.0 [6.0,7.0]      | 7.0 [6.0,8.0

In [12]:
# Descriptive table stratified by death from other causes.
# The patient identifier and the grouping variables used by the other tables
# in this notebook are left out of the rows.
groupby = 'death_from_other_causes'
columns_to_remove = [
    'patient_id',
    'bcr',
    'mts',
    'cancer_specific_mortality',
    'overall_mortality',
]
columns = [name for name in data.columns.tolist() if name not in columns_to_remove]

mytable = TableOne(
    data,
    columns=columns,
    groupby=groupby,
    categorical=categorical,
    nonnormal=nonnormal,
    min_max=min_max,
    missing=False,
    rename={'age': 'Age (years)'},
    sort=False,
)

# Show the table as GitHub-flavoured markdown and save a copy under eda/.
print(mytable.tabulate(tablefmt="github"), '\n')
mytable.to_excel('eda/' + groupby + '.xlsx')


|                                     |         | Overall            | 0                  | 1                 |
|-------------------------------------|---------|--------------------|--------------------|-------------------|
| n                                   |         | 1564               | 1350               | 214               |
| Age (years), median [min,max]       |         | 64.0 [40.0,87.0]   | 63.0 [40.0,87.0]   | 68.0 [53.0,78.0]  |
| psa, median [Q1,Q3]                 |         | 6.4 [4.8,9.7]      | 6.3 [4.8,9.5]      | 7.2 [5.2,10.5]    |
| clinical_stage, n (%)               | 1       | 434 (27.7)         | 380 (28.1)         | 54 (25.2)         |
|                                     | 2       | 911 (58.2)         | 783 (58.0)         | 128 (59.8)        |
|                                     | 3       | 219 (14.0)         | 187 (13.9)         | 32 (15.0)         |
| BxGleason, median [Q1,Q3]           |         | 6.0 [6.0,7.0]      | 6.0 [6.0,7.0]      | 6.0 [6.0,7.0

In [13]:
# Descriptive table stratified by the `bcr` column.
# The patient identifier and the grouping variables used by the other tables
# in this notebook are left out of the rows.
groupby = 'bcr'
columns_to_remove = [
    'patient_id',
    'overall_mortality',
    'mts',
    'cancer_specific_mortality',
    'death_from_other_causes',
]
columns = [name for name in data.columns.tolist() if name not in columns_to_remove]

mytable = TableOne(
    data,
    columns=columns,
    groupby=groupby,
    categorical=categorical,
    nonnormal=nonnormal,
    min_max=min_max,
    missing=False,
    rename={'age': 'Age (years)'},
    sort=False,
)

# Show the table as GitHub-flavoured markdown and save a copy under eda/.
print(mytable.tabulate(tablefmt="github"), '\n')
mytable.to_excel('eda/' + groupby + '.xlsx')


|                                     |         | Overall            | 0                  | 1                  |
|-------------------------------------|---------|--------------------|--------------------|--------------------|
| n                                   |         | 1564               | 1104               | 460                |
| Age (years), median [min,max]       |         | 64.0 [40.0,87.0]   | 64.0 [40.0,87.0]   | 64.0 [44.0,78.0]   |
| psa, median [Q1,Q3]                 |         | 6.4 [4.8,9.7]      | 6.0 [4.6,8.5]      | 8.6 [5.5,13.1]     |
| clinical_stage, n (%)               | 1       | 434 (27.7)         | 345 (31.2)         | 89 (19.3)          |
|                                     | 2       | 911 (58.2)         | 656 (59.4)         | 255 (55.4)         |
|                                     | 3       | 219 (14.0)         | 103 (9.3)          | 116 (25.2)         |
| BxGleason, median [Q1,Q3]           |         | 6.0 [6.0,7.0]      | 6.0 [6.0,7.0]      | 7.0 

In [14]:
# Descriptive table stratified by the `mts` column.
# The patient identifier and the grouping variables used by the other tables
# in this notebook are left out of the rows.
groupby = 'mts'
columns_to_remove = [
    'patient_id',
    'bcr',
    'overall_mortality',
    'cancer_specific_mortality',
    'death_from_other_causes',
]
columns = [name for name in data.columns.tolist() if name not in columns_to_remove]

mytable = TableOne(
    data,
    columns=columns,
    groupby=groupby,
    categorical=categorical,
    nonnormal=nonnormal,
    min_max=min_max,
    missing=False,
    rename={'age': 'Age (years)'},
    sort=False,
)

# Show the table as GitHub-flavoured markdown and save a copy under eda/.
print(mytable.tabulate(tablefmt="github"), '\n')
mytable.to_excel('eda/' + groupby + '.xlsx')

|                                     |         | Overall            | 0                  | 1                 |
|-------------------------------------|---------|--------------------|--------------------|-------------------|
| n                                   |         | 1564               | 1465               | 99                |
| Age (years), median [min,max]       |         | 64.0 [40.0,87.0]   | 64.0 [40.0,87.0]   | 65.0 [48.0,73.0]  |
| psa, median [Q1,Q3]                 |         | 6.4 [4.8,9.7]      | 6.3 [4.8,9.4]      | 8.8 [6.5,14.3]    |
| clinical_stage, n (%)               | 1       | 434 (27.7)         | 426 (29.1)         | 8 (8.1)           |
|                                     | 2       | 911 (58.2)         | 852 (58.2)         | 59 (59.6)         |
|                                     | 3       | 219 (14.0)         | 187 (12.8)         | 32 (32.3)         |
| BxGleason, median [Q1,Q3]           |         | 6.0 [6.0,7.0]      | 6.0 [6.0,7.0]      | 7.0 [6.0,8.0