In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

from datetime import datetime

In [2]:
# Controls:
APPLY_MICE_TO_TEMPORAL = True
FFILL_FIRST_WHEN_APPLY_MICE_TO_TEMPORAL = True

In [3]:
# Prep directories.
dir_list = [
    "./data/processed/before imputation/",
    "./data/processed/after imputation/",
    "./data/processed/after imputation (MICE)/",
]

for dir_ in dir_list:
    if not os.path.exists(dir_):
        os.makedirs(dir_)

In [4]:
df = pd.read_csv('./data/STRATCANS 100221 fixed.csv')
df = df[df.notna().sum(axis=1) != 0]

In [5]:
'''
    There were two duplicated Pseudo IDs: '09.100.701' x2, '09.100.702' x2
    However, the duplicates had very different baseline information; I treat them as independent patients
'''


df = df.rename(columns={'Pseudo ID ': 'Pseudo ID'})
df = df.reset_index()
df = df.rename(columns={'index': 'New ID'})
df['New ID'] = np.arange(len(df))

In [6]:
'''
    Baseline Biopsy in ['HoLEP', 'TURP']:
    
    1) if first biopsy within 365 days & no 'NC'--> replace it
    2) if first biopsy within 365 days & 'NC'--> np.nan 
    3) if no first biopy within 365 days --> np.nan
'''
idx = df['Total number of biopsy cores used at diagnosis'].apply(lambda x: x in ['HoLEP', 'TURP']) 
idx = idx & (df['Days since first biopsy'] < 365)
idx = idx & (df['Primary Gleason.1'] != 'NC')

# all the patients had the same gleason scores; so these features are not upated.
df.loc[idx, 'Biopsy date']  = df.loc[idx, 'Repeat biopsy 1 date']
df.loc[idx, 'Total number of biopsy cores used at diagnosis']  = df.loc[idx, 'Repeat biopsy 1 core total'].astype(float)
df.loc[idx, 'Number of positive biospy cores']  = df.loc[idx, 'Repeat biopsy 1 core positive'].astype(float)
df.loc[idx, 'Core positivity']  = df.loc[idx, 'Number of positive biospy cores']/df.loc[idx, 'Total number of biopsy cores used at diagnosis']

df.loc[idx, 'Date of diagnosis'] = df.loc[idx, 'Biopsy date']

idx = df['Total number of biopsy cores used at diagnosis'].apply(lambda x: x in ['HoLEP', 'TURP']) 

df.loc[idx, 'Total number of biopsy cores used at diagnosis']  = np.nan
df.loc[idx, 'Number of positive biospy cores']  = np.nan
df.loc[idx, 'Core positivity']  = np.nan

In [7]:
baseline_dates = ['Pseudo ID', 'Date of diagnosis', 'Baseline PSA date', 'Biopsy date', 'Baseline or earliest MRI'] \
                 + ['PSA1', 'Date'] \
                 + ['Repeat biopsy 1 core total', 'Repeat biopsy 1 date'] \
                 + ['Date of 1st repeat MRI', 'PRECISE scoring', 'Stage at 1st repeat MRI', 'Volume at 1st repeat MRI'] 

df['Date of birth'] = df['Date of birth'].apply(lambda x: datetime.strptime(x, '%d/%m/%Y') if pd.notna(x) else np.nan)
df['Date of diagnosis'] = df['Date of diagnosis'].apply(lambda x: datetime.strptime(x, '%d/%m/%Y') if pd.notna(x) else np.nan)
df['Baseline PSA date'] = df['Baseline PSA date'].apply(lambda x: datetime.strptime(x, '%d/%m/%Y') if pd.notna(x) else np.nan)
df['Biopsy date']       = df['Biopsy date'].apply(lambda x: datetime.strptime(x, '%d/%m/%Y') if pd.notna(x) else np.nan)
df['Baseline or earliest MRI'] = df['Baseline or earliest MRI'].apply(lambda x: datetime.strptime(x, '%d/%m/%Y') if pd.notna(x) and x!= 'No MRI' else np.nan)
df['Date'] = df['Date'].apply(lambda x: datetime.strptime(x, '%d/%m/%Y')  if pd.notna(x) else np.nan)
df['Repeat biopsy 1 date'] = df['Repeat biopsy 1 date'].apply(lambda x: datetime.strptime(x, '%d/%m/%Y')  if pd.notna(x) else np.nan)
df['Date of 1st repeat MRI'] = df['Date of 1st repeat MRI'].apply(lambda x: datetime.strptime(x, '%d/%m/%Y')  if pd.notna(x) else np.nan)

df['Date of reaching endpoint OR last investigation date if censored'] = df['Date of reaching endpoint OR last investigation date if censored'].apply(lambda x: datetime.strptime(x, '%d/%m/%Y') if pd.notna(x) else np.nan)

In [8]:
'''
Input from Vincent.
    From a pragmatic standpoint I suggest the following:
    1. For all cases where there is PSA, MRI, biopsy before (or around) the diagnosis date - use diagnosis date as start point 
    2. For all cases where there is PSA, biopsy and diagnosis but then MRI happens later but within 365 days 
       - ignore all follow up PSA before the first MRI and use the first PSA after the MRI
    3. For those whose MRI was later than 365 days - use the diagnosis date as start point and MRI baseline is NA
'''


tmp_df = pd.DataFrame([])

tmp_df['Diagnosis - PSA']    = df.apply(lambda x: (x.loc['Date of diagnosis'] - x.loc['Baseline PSA date']).days if pd.notna(x.loc['Baseline PSA date']) else np.nan, axis=1)
tmp_df['Diagnosis - Biopsy'] = df.apply(lambda x: (x.loc['Date of diagnosis'] - x.loc['Biopsy date']).days if pd.notna(x.loc['Biopsy date']) else np.nan, axis=1)
tmp_df['Diagnosis - MRI']    = df.apply(lambda x: (x.loc['Date of diagnosis'] - x.loc['Baseline or earliest MRI']).days if pd.notna(x.loc['Baseline or earliest MRI']) else np.nan, axis=1)


idx1 = (tmp_df['Diagnosis - PSA'] >= 0)&(tmp_df['Diagnosis - Biopsy'] >= 0)&(tmp_df['Diagnosis - MRI'] >=0)
idx2 = ~idx1
idx3 = idx2 & (tmp_df['Diagnosis - MRI'] >=-365)
idx3_1 = idx3 & (tmp_df['Diagnosis - PSA'] >= tmp_df['Diagnosis - MRI']) #MRI is taken later
idx3_2 = idx3 & (tmp_df['Diagnosis - PSA'] < tmp_df['Diagnosis - MRI'])  #PSA is taken later

idx4   = (tmp_df['Diagnosis - MRI'] <-365)
idx4_1 = idx4 & (tmp_df['Diagnosis - PSA'] >= 0)
idx4_2 = idx4 & (tmp_df['Diagnosis - PSA'] < 0) & (tmp_df['Diagnosis - PSA'] >= -365)
idx4_3 = idx4 & (tmp_df['Diagnosis - PSA'] < -365)

idx5   = np.isnan(tmp_df['Diagnosis - MRI'])
idx5_1 = idx5 & (tmp_df['Diagnosis - PSA'] >= 0)
idx5_2 = idx5 & (tmp_df['Diagnosis - PSA'] < 0) & (tmp_df['Diagnosis - PSA'] >= -365)
idx5_3 = idx5 & (tmp_df['Diagnosis - PSA'] < -365)

print('all measurements available before diagnosis: {}'.format(np.sum(idx1)))
print('one or more measurements not available before diagnosis: {}'.format(np.sum(idx2)))
print('MRI & PSA available within 365 days from diagnosis: {}'.format(np.sum(idx3)))
print('MRI not available within 365 days: : {}'.format(np.sum(idx4)))
print('MRI not available at all: : {}'.format(np.sum(idx5)))


df.loc[idx3,   'Date of diagnosis'] = df.loc[idx3, 'Date of diagnosis']
df.loc[idx3_1, 'Date of diagnosis'] = df.loc[idx3_1, 'Baseline or earliest MRI']
df.loc[idx3_2, 'Date of diagnosis'] = df.loc[idx3_2, 'Baseline PSA date']
df.loc[idx4_2, 'Date of diagnosis'] = df.loc[idx4_2, 'Baseline PSA date']
df.loc[idx5_2, 'Date of diagnosis'] = df.loc[idx5_2, 'Baseline PSA date']

df['Exact age at diagnosis'] = df.apply(lambda x: (x['Date of diagnosis'] - x['Date of birth']).days/365.25, axis=1)

all measurements available before diagnosis: 257
one or more measurements not available before diagnosis: 331
MRI & PSA available within 365 days from diagnosis: 164
MRI not available within 365 days: : 118
MRI not available at all: : 49


In [9]:
feat_baseline = ['New ID', 'Date of diagnosis', 'Exact age at diagnosis',
                 'Ethnicity', 'Family History of Prostate Cancer', 'Number of negative biopsies before diagnosis',
                 'Stage (highest of clinical or MRI stage)', 'PSA at diagnosis', 'Prostatic volume at diagnosis',
                 'PSA density at diagnosis',
                 'Total number of biopsy cores used at diagnosis', 'Number of positive biospy cores', 'Primary Gleason', 'Secondary Gleason', 'Grade group',
                 'Core positivity', 'CPG', 'PI-RADS score', 'Number of MRI-visible lesions', 'STRATCANS (simplified)'] + ['Biopsy date']

feat_label    = ['CPG3 Outcome', 'Coding.3', 'Date of reaching endpoint OR last investigation date if censored', 'Days since diagnosis.3']
# feat_label    = ['CPG3 Outcome', 'CPG3 endpoint', 'Coding.3',
#                  'Date of reaching endpoint OR last investigation date if censored', 'Days since diagnosis.3', 'Years since diagnosis.3']


feat_psa      = ['PSA1', 'Date', 'Days since first PSA']
for i in range(1, 45):
    feat_psa += ['PSA{}'.format(i+1), 'Date.{}'.format(i), 'Days since first PSA.{}'.format(i)]
                 

feat_biopsy   = ['Repeat biopsy 1 core total', 'Repeat biopsy 1 core positive', 'Primary Gleason.1', 'Secondary Gleason.1', 'Grade group.1', 'Repeat biopsy 1 date', 'Days since first biopsy']
for i in range(2, 6):
    feat_biopsy += ['Repeat biopsy {} core total'.format(i), 'Repeat biopsy {} core positive'.format(i), 'Primary Gleason.{}'.format(i), 'Secondary Gleason.{}'.format(i), 
                    'Grade group.{}'.format(i), 'Repeat biopsy {} date'.format(i), 'Days since first biopsy.{}'.format(i-1)]
    
feat_mri  = ['Date of 1st repeat MRI', 'PRECISE scoring', 'Stage at 1st repeat MRI', 'Volume at 1st repeat MRI', 'PSAd at 1st repeat MRI', 'Days since baseline MRI']
feat_mri += ['Date of 2nd repeat MRI', 'PRECISE scoring.1', 'Stage at 2nd repeat MRI', 'Volume at 2nd repeat MRI', 'PSAd at 2nd repeat MRI', 'Days since baseline MRI.1']
feat_mri += ['Date of 3rd repeat MRI', 'PRECISE scoring.2', 'Stage at 3rd repeat MRI', 'Volume at 3rd repeat MRI', 'PSAd at 3rd repeat MRI', 'Days since baseline MRI.2']
feat_mri += ['Date of 4th repeat MRI', 'PRECISE scoring.3', 'Stage at 4th repeat MRI', 'Volume at 4th repeat MRI', 'PSAd at 4th repeat MRI', 'Days since baseline MRI.3']
feat_mri += ['Date of 5th repeat MRI', 'PRECISE scoring.4', 'Stage at 5th repeat MRI', 'Volume at 5th repeat MRI', 'PSAd at 5th repeat MRI', 'Days since baseline MRI.4']
feat_mri += ['Date of 6th repeat MRI', 'PRECISE scoring.5', 'Stage at 6th repeat MRI', 'Volume at 6th repeat MRI', 'PSAd at 6th repeat MRI', 'Days since baseline MRI.5']
feat_mri += ['Date of 7th repeat MRI', 'PRECISE scoring.6', 'Stage at 7th repeat MRI', 'Volume at 7th repeat MRI', 'PSAd at 7th repeat MRI', 'Days since baseline MRI.6']
feat_mri += ['Date of 8th repeat MRI', 'PRECISE scoring.7', 'Stage at 8th repeat MRI', 'Volume at 8th repeat MRI', 'PSAd at 8th repeat MRI', 'Days since baseline MRI.7']

df_new    = df[feat_baseline + feat_label + feat_psa + feat_biopsy + feat_mri]

In [10]:
feat_psa_new = []
for i in range(45):
    count = i + 1
    feat_psa_new += ['Repeat PSA {}'.format(count), 'Repeat PSA {} Date'.format(count), 'Repeat PSA {} Days Since Diagnosis'.format(count)]
    
feat_biopsy_new = []
for i in range(5):
    count = i+1
    feat_biopsy_new += [
        'Repeat Biopsy {} Core Total'.format(count), 'Repeat Biopsy {} Core Positive'.format(count), 'Repeat Biopsy {} Primary Gleason'.format(count), 
        'Repeat Biopsy {} Secondary Gleason'.format(count), 'Repeat Biopsy {} Grade Group'.format(count), 
        'Repeat Biopsy {} Date'.format(count), 'Repeat Biopsy {} Days Since Diagnosis'.format(count)
    ]
    
feat_mri_new = []
for i in range(8):
    count = i+1
    feat_mri_new += [
        'Repeat MRI {} Date'.format(count), 'Repeat MRI {} PRECISE Scoring'.format(count), 'Repeat MRI {} Stage'.format(count), 
        'Repeat MRI {} Volume'.format(count), 'Repeat MRI {} PSAd'.format(count), 'Repeat MRI {} Days Since Diagnosis'.format(count)
    ]
    
df_new.columns = feat_baseline + feat_label + feat_psa_new + feat_biopsy_new + feat_mri_new

In [11]:
for i in range(45):
    count = i + 1
    if count > 1:
        df_new['Repeat PSA {} Date'.format(count)] = df_new['Repeat PSA {} Date'.format(count)].apply(lambda x: datetime.strptime(x, '%d/%m/%Y') if not pd.isna(x) else np.nan)
    df_new['Repeat PSA {} Days Since Diagnosis'.format(count)] = df_new.apply(lambda x: (x.loc['Repeat PSA {} Date'.format(count)] - x.loc['Date of diagnosis']).days if not pd.isna(x['Repeat PSA {} Date'.format(count)]) else np.nan, axis=1)
        
for i in range(5):
    count = i + 1
    if count > 1:
        df_new['Repeat Biopsy {} Date'.format(count)] = df_new['Repeat Biopsy {} Date'.format(count)].apply(lambda x: datetime.strptime(x, '%d/%m/%Y') if not pd.isna(x) else np.nan)
    df_new['Repeat Biopsy {} Days Since Diagnosis'.format(count)] = df_new.apply(lambda x: (x.loc['Repeat Biopsy {} Date'.format(count)] - x.loc['Date of diagnosis']).days if not pd.isna(x['Repeat Biopsy {} Date'.format(count)]) else np.nan, axis=1)
        
for i in range(8):
    count = i + 1
    if count > 1:
        df_new['Repeat MRI {} Date'.format(count)] = df_new['Repeat MRI {} Date'.format(count)].apply(lambda x: datetime.strptime(x, '%d/%m/%Y') if not pd.isna(x) else np.nan)
    df_new['Repeat MRI {} Days Since Diagnosis'.format(count)] = df_new.apply(lambda x: (x.loc['Repeat MRI {} Date'.format(count)] - x.loc['Date of diagnosis']).days if not pd.isna(x['Repeat MRI {} Date'.format(count)]) else np.nan, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] 

In [12]:
'''
    Update dignosis PSA with the latest one.

'''
idx1 = np.where(df_new['Repeat PSA 1 Days Since Diagnosis'] < 0)[0]
for pid in idx1:
    df_new.loc[pid, 'PSA at diagnosis'] = df_new.loc[pid, 'Repeat PSA 1']
    
idx1 = np.where(df_new['Repeat PSA 2 Days Since Diagnosis'] < 0)[0]
for pid in idx1:
    df_new.loc[pid, 'PSA at diagnosis'] = df_new.loc[pid, 'Repeat PSA 2']
    
'''
    Update diagnosis Biopsy with the latest one. (After the daignosis date is adjusted)
        - Replace with the most recent Biopsy 
        - If the recent biopsy is "NC" or "HoLEP" discard it.
'''
for pid in [4, 177, 248, 362, 383, 510]:
    df_new.loc[pid, 'Total number of biopsy cores used at diagnosis'] = df_new.loc[pid, 'Repeat Biopsy 1 Core Total']
    df_new.loc[pid, 'Number of positive biospy cores'] = df_new.loc[pid, 'Repeat Biopsy 1 Core Positive']
    df_new.loc[pid, 'Primary Gleason'] = df_new.loc[pid, 'Repeat Biopsy 1 Primary Gleason']
    df_new.loc[pid, 'Secondary Gleason'] = df_new.loc[pid, 'Repeat Biopsy 1 Secondary Gleason']
    df_new.loc[pid, 'Grade group'] = df_new.loc[pid, 'Repeat Biopsy 1 Grade Group']
    df_new.loc[pid, 'Core positivity'] = float(df_new.loc[pid, 'Number of positive biospy cores'])/float(df_new.loc[pid, 'Total number of biopsy cores used at diagnosis'])

    if pid in [4, 177]:
        df_new.loc[pid, 'CPG'] = 2

        
'''
    Ignore MRI measurements.. 
    New ID 104, 263...
'''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


'\n    Ignore MRI measurements.. \n    New ID 104, 263...\n'

In [13]:
df_psa = pd.DataFrame([], columns=['New ID'] + ['Repeat PSA', 'Repeat PSA Date', 'Repeat PSA Days Since Diagnosis'])

for i in range(45):
    count = i + 1
    tmp_feat = ['Repeat PSA {}'.format(count), 'Repeat PSA {} Date'.format(count), 'Repeat PSA {} Days Since Diagnosis'.format(count)]
    tmp_df   = df_new[['New ID'] + tmp_feat]
    tmp_df.columns = ['New ID'] + ['Repeat PSA', 'Repeat PSA Date', 'Repeat PSA Days Since Diagnosis']
    
    df_psa = pd.concat([df_psa, tmp_df], axis=0)
    
df_psa = df_psa.sort_values(by=['New ID', 'Repeat PSA Days Since Diagnosis']).reset_index(drop=True)
df_psa = df_psa[df_psa['Repeat PSA Days Since Diagnosis'] > 0].reset_index(drop=True)

In [14]:
feat_list = ['New ID'] + ['Repeat Biopsy Core Total', 'Repeat Biopsy Core Positive', 'Repeat Biopsy Primary Gleason', 
        'Repeat Biopsy Secondary Gleason', 'Repeat Biopsy Grade Group', 'Repeat Biopsy Date', 'Repeat Biopsy Days Since Diagnosis']

df_biopsy = pd.DataFrame([], columns= feat_list)

for i in range(5):
    count = i + 1
    tmp_feat = [
        'Repeat Biopsy {} Core Total'.format(count), 'Repeat Biopsy {} Core Positive'.format(count), 'Repeat Biopsy {} Primary Gleason'.format(count), 
        'Repeat Biopsy {} Secondary Gleason'.format(count), 'Repeat Biopsy {} Grade Group'.format(count), 
        'Repeat Biopsy {} Date'.format(count), 'Repeat Biopsy {} Days Since Diagnosis'.format(count)
    ]
    tmp_df   = df_new[['New ID'] + tmp_feat]
    tmp_df.columns = feat_list
    
    df_biopsy = pd.concat([df_biopsy, tmp_df], axis=0)
    
df_biopsy = df_biopsy.sort_values(by=['New ID', 'Repeat Biopsy Days Since Diagnosis'])
df_biopsy = df_biopsy[df_biopsy['Repeat Biopsy Days Since Diagnosis'] > 0].reset_index(drop=True)

In [15]:
feat_list = ['New ID'] + ['Repeat MRI PRECISE Scoring', 'Repeat MRI Stage', 
        'Repeat MRI Volume', 'Repeat MRI PSAd', 'Repeat MRI Date',  'Repeat MRI Days Since Diagnosis']

df_mri = pd.DataFrame([], columns= feat_list)

for i in range(5):
    count = i + 1
    tmp_feat =  [
        'Repeat MRI {} PRECISE Scoring'.format(count), 'Repeat MRI {} Stage'.format(count), 'Repeat MRI {} Volume'.format(count), 
        'Repeat MRI {} PSAd'.format(count), 'Repeat MRI {} Date'.format(count), 'Repeat MRI {} Days Since Diagnosis'.format(count)
    ]
    
    tmp_df   = df_new[['New ID'] + tmp_feat]
    tmp_df.columns = feat_list
    
    df_mri = pd.concat([df_mri, tmp_df], axis=0)
    
df_mri = df_mri.sort_values(by=['New ID', 'Repeat MRI Days Since Diagnosis'])
df_mri = df_mri[df_mri['Repeat MRI Days Since Diagnosis'] > 0].reset_index(drop=True)    

In [16]:
df_new['Days since diagnosis.3'] = df_new.apply(lambda x: (x['Date of reaching endpoint OR last investigation date if censored'] - x['Date of diagnosis']).days, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [17]:
list(df_new)

feat_id  = [
    'New ID'
]
feat_label    = ['CPG3 Outcome', 'Coding.3', 'Days since diagnosis.3']

feat_baseline_con = [
    'Exact age at diagnosis',
    'Number of negative biopsies before diagnosis',
    'PSA at diagnosis',
    'Prostatic volume at diagnosis',
    'PSA density at diagnosis',
    'Total number of biopsy cores used at diagnosis', #some non-integer values should be changed
    'Number of positive biospy cores',  #some non-integer values should be changed
    'Core positivity',   
    'Number of MRI-visible lesions'
]

## nominalcategorical variables --> treat each number as category; no assumption made
feat_baseline_cat1 = [
    'Ethnicity',
    'Family History of Prostate Cancer',
    'Stage (highest of clinical or MRI stage)',
]

## ordinal categorical variables --> treat each number as continuous; but imputation is done as categories.
feat_baseline_cat2 = [
    # I treat them as norminal variables (one main reason is that it only contains 2 groups...)
    'Primary Gleason',    #should be removed since all 3
    'Secondary Gleason',  #since this is ordinal, i presume it is okay to leave it as continuous
    'Grade group',      #this should be removed since all equivalent to 'Secondary Gleason'
    'CPG',
    
    'PI-RADS score',  #treat these categorical variables as ordinal
    'STRATCANS (simplified)', #treat these categorical variables as ordinal    
]


'''
### Ethnicity
    White British         513
    Other White            12
    White Irish             5
    Other Chinese           2
    Other Asian             1
    Black Caribbean         1
    Mixed White-Asian       1
    Asian Indian            1
    Black African           1
    Other Black             1
    Asian Pakistani         1
    Other Ethnic Group      1
    Asian Bangladeshi       1 
    nan                    47
    --> all categories other than White Britsh will be 0. 
    --> But do consider to remove this feature out.
    
### Family History of Prostate Cancer
    No     373
    Yes     81
    no       6
    nan    128
    
### Stage
    T1c    267   -> T1
    T2     264   -> T2
    T1a     30   -> T1
    T2a     17   -> T2
    T1b      5   -> T2
    T2b      2   -> T2
    t2       2   -> T2
    T2c      1   -> T2
    nan      0
'''

for feat in feat_baseline_con:
    df_new[feat] = df_new[feat].astype(float)


feat = 'Ethnicity'
df_new.loc[pd.isna(df_new[feat]), feat] = 'White British'
df_new[feat] = df_new[feat].apply(lambda x: 1. if x == 'White British' else 0.)

feat = 'Family History of Prostate Cancer'
df_new.loc[pd.isna(df_new[feat]), feat] = 'No'
df_new[feat] = df_new[feat].apply(lambda x: 0. if str(x).lower() == 'no' else 1.)


feat = 'Stage (highest of clinical or MRI stage)'

mapper_stage = {
    'T1a': 0.,
    'T1b': 0.,
    'T1c': 0., 
    't2' : 1., 
    'T2': 1.,    
    'T2a': 1.,     
    'T2b': 1.,
    'T2c': 1.,
    'T3' : 2.,
    'T3a': 2.,
    'T4a': 2.
}
df_new[feat] = df_new[feat].apply(lambda x: mapper_stage[x] )

feat = 'Primary Gleason'
df_new[feat] = df_new[feat].apply(lambda x: float(x))

feat = 'Secondary Gleason'
df_new[feat] = df_new[feat].apply(lambda x: float(x))

feat = 'CPG'
df_new[feat] = df_new[feat].apply(lambda x: float(x))

feat = 'Grade group'
df_new[feat] = df_new[feat].apply(lambda x: float(x))
                    
                     
mapper_cpg_out = {
    'CPG3 (biopsy)':1.,
    'CPG3 (2 PSAs)':1., 
    'CPG3 (biopsy + PSAs)':1.,
    'CPG4 (biopy)':2., 
    'CPG4 (MRI)':2., 
    'CPG4 (2 PSAs)':2., 
    'CPG4 (biopsy)':2., 
    'CPG4 (MRI + 2 PSAs)':2.,
    'CPG5 (biopsy)':3.,  
    'Follow up':0., 
    'Treatment':4.,
    'Death':5.
}

df_new['CPG3 Outcome'] = df_new['CPG3 Outcome'].apply(lambda x: mapper_cpg_out[x] )


df_baseline = df_new[feat_id + feat_baseline_con + feat_baseline_cat1 + feat_baseline_cat2 + feat_label]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/p

In [18]:
'''
    'MRI 1,...8 Stage'
    T2     631
    T1c     25
    T3a     25
    T2c      2
    T2b      2
    T2a      1
    T3       1
    T4a      1
    
    nan    -> T2
    --> lump T3 T3a T4a together = 2
        
    PRECISE Scoring
    Leave it as orginal categories.
'''

feat = 'Repeat MRI Stage'
df_mri[feat] = df_mri[feat].apply(lambda x: mapper_stage[x] if x is not np.nan else np.nan )

In [19]:
### Put np.nan when Primary Gleason = 'NC' 
### Put np.nan when Core Positive = 0
df_biopsy['Repeat Biopsy Core Total'] = df_biopsy['Repeat Biopsy Core Total'].apply(lambda x: np.nan if x in ['HoLEP', 'TURP'] else x)
df_biopsy['Repeat Biopsy Core Total'] = df_biopsy['Repeat Biopsy Core Total'].astype(float)

df_biopsy.loc[df_biopsy['Repeat Biopsy Primary Gleason'] == 'NC', 'Repeat Biopsy Primary Gleason'] = np.nan
df_biopsy['Repeat Biopsy Primary Gleason'] = df_biopsy['Repeat Biopsy Primary Gleason'].astype(float)
df_biopsy.loc[df_biopsy['Repeat Biopsy Primary Gleason'] == 0, 'Repeat Biopsy Primary Gleason'] = np.nan

df_biopsy.loc[df_biopsy['Repeat Biopsy Secondary Gleason'] == 'NC', 'Repeat Biopsy Secondary Gleason'] = np.nan
df_biopsy['Repeat Biopsy Secondary Gleason'] = df_biopsy['Repeat Biopsy Secondary Gleason'].astype(float)
df_biopsy.loc[df_biopsy['Repeat Biopsy Secondary Gleason'] == 0, 'Repeat Biopsy Secondary Gleason'] = np.nan

df_biopsy.loc[df_biopsy['Repeat Biopsy Grade Group'] == 'NC', 'Repeat Biopsy Grade Group'] = np.nan
df_biopsy['Repeat Biopsy Grade Group'] = df_biopsy['Repeat Biopsy Grade Group'].astype(float)
df_biopsy.loc[df_biopsy['Repeat Biopsy Grade Group'] == 0, 'Repeat Biopsy Grade Group'] = np.nan

In [20]:
df_psa = df_psa.drop(columns=['Repeat PSA Date'])
df_biopsy = df_biopsy.drop(columns=['Repeat Biopsy Date'])
df_mri = df_mri.drop(columns=['Repeat MRI Date'])

df_psa = df_psa.rename(columns={'Repeat PSA Days Since Diagnosis': 'Days Since Diagnosis'})
df_biopsy = df_biopsy.rename(columns={'Repeat Biopsy Days Since Diagnosis': 'Days Since Diagnosis'})
df_mri = df_mri.rename(columns={'Repeat MRI Days Since Diagnosis': 'Days Since Diagnosis'})

In [21]:
### REMOVE MEASUREMENTS AFTER TTE! (Y == 1: t >= tte removed, Y == 0: t > tte removed)

### PSA
id_list = df_psa['New ID'].unique()
grouped = df_psa.groupby(by='New ID')

for i, pid in enumerate(id_list):
    tmp = grouped.get_group(pid)
    tmp_tte = df_baseline.loc[df_baseline['New ID'] == pid, 'Days since diagnosis.3'].values[0]
    tmp_y   = df_baseline.loc[df_baseline['New ID'] == pid, 'Coding.3'].values[0]
    
    if tmp_y == 0:
        tmp = tmp[tmp['Days Since Diagnosis'] <= tmp_tte]
    else: #tmp_y == 1
        tmp = tmp[tmp['Days Since Diagnosis'] < tmp_tte]
    
    if i == 0:
        df_psa_new = tmp
    else:
        df_psa_new = pd.concat([df_psa_new, tmp], axis=0)
        
### BIOPSY
id_list = df_biopsy['New ID'].unique()
grouped = df_biopsy.groupby(by='New ID')

for i, pid in enumerate(id_list):
    tmp = grouped.get_group(pid)
    tmp_tte = df_baseline.loc[df_baseline['New ID'] == pid, 'Days since diagnosis.3'].values[0]
    tmp_y   = df_baseline.loc[df_baseline['New ID'] == pid, 'Coding.3'].values[0]
    
    if tmp_y == 0:
        tmp = tmp[tmp['Days Since Diagnosis'] <= tmp_tte]
    else: #tmp_y == 1
        tmp = tmp[tmp['Days Since Diagnosis'] < tmp_tte]
    
    if i == 0:
        df_biopsy_new = tmp
    else:
        df_biopsy_new = pd.concat([df_biopsy_new, tmp], axis=0)
        
### MRI
id_list = df_mri['New ID'].unique()
grouped = df_mri.groupby(by='New ID')

for i, pid in enumerate(id_list):
    tmp = grouped.get_group(pid)
    tmp_tte = df_baseline.loc[df_baseline['New ID'] == pid, 'Days since diagnosis.3'].values[0]
    tmp_y   = df_baseline.loc[df_baseline['New ID'] == pid, 'Coding.3'].values[0]
    
    if tmp_y == 0:
        tmp = tmp[tmp['Days Since Diagnosis'] <= tmp_tte]
    else: #tmp_y == 1
        tmp = tmp[tmp['Days Since Diagnosis'] < tmp_tte]
    
    if i == 0:
        df_mri_new = tmp
    else:
        df_mri_new = pd.concat([df_mri_new, tmp], axis=0)

In [22]:
### ADD BASELINE INTO TEMPORAL FEATURES
## PSA
tmp = df_baseline[['New ID', 'PSA at diagnosis']]
tmp['Days Since Diagnosis'] = 0.
tmp.columns = list(df_psa_new)

df_psa_new = pd.concat([tmp, df_psa_new], axis=0).sort_values(by=['New ID', 'Days Since Diagnosis']).reset_index(drop=True)

## BIOPSY
tmp = df_baseline[['New ID', 'Total number of biopsy cores used at diagnosis', 'Number of positive biospy cores', 'Primary Gleason', 'Secondary Gleason', 'Grade group']]
tmp['Days Since Diagnosis'] = 0.
tmp.columns = list(df_biopsy_new)
df_biopsy_new = pd.concat([tmp, df_biopsy_new], axis=0).sort_values(by=['New ID', 'Days Since Diagnosis']).reset_index(drop=True)


## MRI
tmp = df_baseline[['New ID', 'Stage (highest of clinical or MRI stage)', 'Prostatic volume at diagnosis', 'PSA density at diagnosis']]
tmp['Repeat MRI PRECISE Scoring'] = np.nan
tmp['Days Since Diagnosis'] = 0.
tmp = tmp[['New ID', 'Repeat MRI PRECISE Scoring', 'Stage (highest of clinical or MRI stage)', 'Prostatic volume at diagnosis', 'PSA density at diagnosis', 'Days Since Diagnosis']]
tmp.columns = list(df_mri_new)
df_mri_new = pd.concat([tmp, df_mri_new], axis=0).sort_values(by=['New ID', 'Days Since Diagnosis']).reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = valu

In [23]:
df_psa_new    = df_psa_new.drop_duplicates(subset=['New ID', 'Days Since Diagnosis'], keep='last').reset_index(drop=True)
df_mri_new    = df_mri_new.drop_duplicates(subset=['New ID', 'Days Since Diagnosis'], keep='last').reset_index(drop=True)
df_biopsy_new = df_biopsy_new.drop_duplicates(subset=['New ID', 'Days Since Diagnosis'], keep='last').reset_index(drop=True)

In [24]:
feat_baseline = [
    'Exact age at diagnosis',
    'Number of negative biopsies before diagnosis',
    'Number of MRI-visible lesions',
    'Ethnicity',
    'Family History of Prostate Cancer',
    'CPG',    
    'PI-RADS score',  #treat these categorical variables as ordinal
    'STRATCANS (simplified)', #treat these categorical variables as ordinal    
]
feat_label    = ['CPG3 Outcome', 'Coding.3', 'Days since diagnosis.3']

df_baseline = df_baseline[['New ID'] + feat_baseline + feat_label]

In [25]:
df_baseline   = df_baseline[df_baseline['Days since diagnosis.3'] > 0].reset_index(drop=True)

In [26]:
df_psa_new    = df_psa_new[df_psa_new['New ID'].apply(lambda x: x in df_baseline['New ID'].tolist())].reset_index(drop=True)
df_biopsy_new = df_biopsy_new[df_biopsy_new['New ID'].apply(lambda x: x in df_baseline['New ID'].tolist())].reset_index(drop=True)
df_mri_new    = df_mri_new[df_mri_new['New ID'].apply(lambda x: x in df_baseline['New ID'].tolist())].reset_index(drop=True)

In [27]:
df_psa_new.to_csv('./data/processed/before imputation/repeat_psa.csv', index=0)
df_biopsy_new.to_csv('./data/processed/before imputation/repeat_biopsy.csv', index=0)
df_mri_new.to_csv('./data/processed/before imputation/repeat_mri.csv', index=0)

df_baseline.to_csv('./data/processed/before imputation/baseline.csv', index=0)

# IMPUTATION

In [28]:
## STAGE --> categorical! (one hot value)
## Gleason Scores & Grade group --> orginal variable

## Last observation carried forward!! 
## Static: mean + median imputation.

df_baseline = pd.read_csv('./data/processed/before imputation/baseline.csv')
df_psa      = pd.read_csv('./data/processed/before imputation/repeat_psa.csv')
df_biopsy   = pd.read_csv('./data/processed/before imputation/repeat_biopsy.csv')
df_mri      = pd.read_csv('./data/processed/before imputation/repeat_mri.csv')

In [29]:
first = df_psa.drop_duplicates(subset=['New ID',], keep='first')[['Days Since Diagnosis']]
diff  = df_psa['Days Since Diagnosis'].diff()

indx = [i for i in diff.index if i not in first.index]
print(np.quantile(diff[indx], 0.25))
print(np.quantile(diff[indx], 0.50))
print(np.quantile(diff[indx], 0.75))


first = df_biopsy.drop_duplicates(subset=['New ID',], keep='first')[['Days Since Diagnosis']]
diff  = df_biopsy['Days Since Diagnosis'].diff()

indx = [i for i in diff.index if i not in first.index]
print(np.quantile(diff[indx], 0.25))
print(np.quantile(diff[indx], 0.50))
print(np.quantile(diff[indx], 0.75))


first = df_mri.drop_duplicates(subset=['New ID',], keep='first')[['Days Since Diagnosis']]
diff  = df_mri['Days Since Diagnosis'].diff()

indx = [i for i in diff.index if i not in first.index]
print(np.quantile(diff[indx], 0.25))
print(np.quantile(diff[indx], 0.50))
print(np.quantile(diff[indx], 0.75))


84.0
102.0
182.0
157.25
614.0
1085.5
378.0
462.5
662.0


In [30]:
def count_nans(df, txt=None):
    if txt is not None:
        print(txt, "N nans:", df.isnull().sum().sum())
    else:
        print("N nans:", df.isnull().sum().sum())

def pour_array_into_df(df, array):
    # Put data from `array` into `df`, preserving the `df`'s original datatypes.
    for idx, c in enumerate(df.columns):
        dtype = df.dtypes[idx]
        df[c] = array[:, idx].astype(dtype)
    # count_nans(df)
    return df

def mice_impute(df, ignore_cols, categorical_feats, random_state=0):
    cols_used = [c for c in df.columns if c not in ignore_cols]
    df_imp = df[cols_used].copy()
    # display(df_imp)
    d = df_imp.to_numpy()

    min_max_recorded = dict()
    for c in categorical_feats:
        min_max_recorded[c] = df_imp[c].min(), df_imp[c].max()
    #print(min_max_recorded)

    imp = IterativeImputer(max_iter=10, random_state=random_state)
    d = imp.fit_transform(d)
    df_imp = pour_array_into_df(df_imp, d)
    for c in categorical_feats:
        # Round to int.
        df_imp[c] = np.rint(df_imp[c].to_numpy())
        # Clip (so that we don't get non-existent categorical values).
        min_, max_ = min_max_recorded[c]
        df_imp.loc[df_imp[c] < min_, c] = min_
        df_imp.loc[df_imp[c] > max_, c] = max_
    # count_nans(df_imp)
    
    # Transform back into original df.
    df_out = df.copy()
    for c in cols_used:
        df_out.loc[:, c] = df_imp.loc[:, c]
    # display(df_out)
    
    # count_nans(df_out)
    return df_out

In [31]:
feat_baseline_con = ['Exact age at diagnosis', 'Number of negative biopsies before diagnosis', 'Number of MRI-visible lesions']
feat_baseline_cat = ['Ethnicity', 'Family History of Prostate Cancer', 'CPG', 'PI-RADS score', 'STRATCANS (simplified)']

In [32]:
### BASELINE
df_baseline_imputed = df_baseline.copy()

for feat in feat_baseline_con:
    df_baseline_imputed.loc[df_baseline_imputed[feat].isna(), feat] = df_baseline_imputed.loc[df_baseline_imputed[feat].notna(), feat].mean()
    
for feat in feat_baseline_cat:
    df_baseline_imputed.loc[df_baseline_imputed[feat].isna(), feat] = df_baseline_imputed.loc[df_baseline_imputed[feat].notna(), feat].mode()[0]

# display(df_baseline_imputed)
# df_baseline_imputed.info()
# count_nans(df_baseline_imputed)

df_baseline_imputed.to_csv('./data/processed/after imputation/baseline.csv', index=0)

In [33]:
### BASELINE (MICE)

count_nans(df_baseline, "before")
df_baseline_imputed = mice_impute(df=df_baseline, ignore_cols=["New ID"], categorical_feats=feat_baseline_cat, random_state=0)
count_nans(df_baseline_imputed, "after")

df_baseline_imputed.to_csv('./data/processed/after imputation (MICE)/baseline.csv', index=0)

before N nans: 168
after N nans: 0


In [34]:
### PSA (no missingness)
grouped = df_psa.groupby(by='New ID')
df_psa_new = df_psa.copy(deep=True)

count_nans(df_psa)
assert df_psa.isnull().sum().sum() == 0

impute_psa = pd.DataFrame([], index=list(df_psa)[1:-1])
impute_psa['mean'] = df_psa.mean()
impute_psa['mode'] = np.nan

df_psa_new.to_csv('./data/processed/after imputation/repeat_psa.csv', index=0)
df_psa_new.to_csv('./data/processed/after imputation (MICE)/repeat_psa.csv', index=0)

N nans: 0


In [35]:
### Biopsy
id_list = df_biopsy['New ID'].unique()
grouped = df_biopsy.groupby(by='New ID')

for i, pid in enumerate(id_list):
    tmp = grouped.get_group(pid)
    tmp = tmp.fillna(method='ffill')
    
    if i == 0:
        df_biopsy_new = tmp
    else:
        df_biopsy_new = pd.concat([df_biopsy_new, tmp], axis=0)
df_biopsy_new  = df_biopsy_new.reset_index(drop=True)

impute_biopsy = pd.DataFrame([], index=list(df_biopsy)[1:-1])
impute_biopsy['mean'] = df_biopsy.mean()
impute_biopsy['mode'] = np.nan
for feat in list(df_biopsy)[1:-1]:
    impute_biopsy.loc[feat, 'mode'] = df_biopsy[feat].mode()[0]
    
for feat in list(df_biopsy)[1:-1]:
    if feat in ['Repeat Biopsy Primary Gleason', 'Repeat Biopsy Secondary Gleason', 'Repeat Biopsy Grade Group']:
        df_biopsy_new.loc[df_biopsy_new[feat].isna(), feat] = impute_biopsy.loc[feat, 'mode']
    else:
        df_biopsy_new.loc[df_biopsy_new[feat].isna(), feat] = impute_biopsy.loc[feat, 'mean']

df_biopsy_new.to_csv('./data/processed/after imputation/repeat_biopsy.csv', index=0)
if not APPLY_MICE_TO_TEMPORAL:
    df_biopsy_new.to_csv('./data/processed/after imputation (MICE)/repeat_biopsy.csv', index=0)

In [36]:
### Biopsy (MICE)
# NOTE: For temporal MICE, simply treat each (example, time-step) as "example".

if APPLY_MICE_TO_TEMPORAL:
    
    if FFILL_FIRST_WHEN_APPLY_MICE_TO_TEMPORAL:
        id_list = df_biopsy['New ID'].unique()
        grouped = df_biopsy.groupby(by='New ID')
        for i, pid in enumerate(id_list):
            tmp = grouped.get_group(pid)
            tmp = tmp.fillna(method='ffill')

            if i == 0:
                df_biopsy_new = tmp
            else:
                df_biopsy_new = pd.concat([df_biopsy_new, tmp], axis=0)
        df_biopsy_new  = df_biopsy_new.reset_index(drop=True)
    else:
        df_biopsy_new = df_biopsy.copy()
    
    count_nans(df_biopsy, "before")
    df_biopsy_new = mice_impute(
        df=df_biopsy_new, 
        ignore_cols=["New ID"], 
        categorical_feats=['Repeat Biopsy Primary Gleason', 'Repeat Biopsy Secondary Gleason', 'Repeat Biopsy Grade Group'], 
        random_state=0
    )
    count_nans(df_biopsy_new, "after")
    
    df_biopsy_new.to_csv('./data/processed/after imputation (MICE)/repeat_biopsy.csv', index=0)

before N nans: 803
after N nans: 0


In [37]:
### MRI
id_list = df_mri['New ID'].unique()
grouped = df_mri.groupby(by='New ID')

for i, pid in enumerate(id_list):
    tmp = grouped.get_group(pid)
    tmp = tmp.fillna(method='ffill')
    
    if i == 0:
        df_mri_new = tmp
    else:
        df_mri_new = pd.concat([df_mri_new, tmp], axis=0)
df_mri_new  = df_mri_new.reset_index(drop=True)

impute_mri = pd.DataFrame([], index=list(df_mri)[1:-1])
impute_mri['mean'] = df_mri.mean()
impute_mri['mode'] = np.nan
for feat in list(df_mri)[1:-1]:
    impute_mri.loc[feat, 'mode'] = df_mri[feat].mode()[0]
    
for feat in list(df_mri_new)[1:-1]:
    if feat in ['Repeat MRI Stage', 'Repeat MRI PRECISE Scoring']:
        df_mri_new.loc[df_mri_new[feat].isna(), feat] = impute_mri.loc[feat, 'mode']
    else:
        df_mri_new.loc[df_mri_new[feat].isna(), feat] = impute_mri.loc[feat, 'mean']

df_mri_new.to_csv('./data/processed/after imputation/repeat_mri.csv', index=0)
if not APPLY_MICE_TO_TEMPORAL:
    df_mri_new.to_csv('./data/processed/after imputation (MICE)/repeat_mri.csv', index=0)

In [38]:
### MRI (MICE)

if APPLY_MICE_TO_TEMPORAL:
    
    if FFILL_FIRST_WHEN_APPLY_MICE_TO_TEMPORAL:
        id_list = df_mri['New ID'].unique()
        grouped = df_mri.groupby(by='New ID')
        for i, pid in enumerate(id_list):
            tmp = grouped.get_group(pid)
            tmp = tmp.fillna(method='ffill')

            if i == 0:
                df_mri_new = tmp
            else:
                df_mri_new = pd.concat([df_mri_new, tmp], axis=0)
        df_mri_new  = df_mri_new.reset_index(drop=True)
    else:
        df_mri_new = df_mri.copy()
    
    count_nans(df_mri, "before")
    df_mri_new = mice_impute(
        df=df_mri_new, 
        ignore_cols=["New ID"], 
        categorical_feats=['Repeat MRI Stage', 'Repeat MRI PRECISE Scoring'], 
        random_state=0
    )
    count_nans(df_mri_new, "after")
    
    df_biopsy_new.to_csv('./data/processed/after imputation (MICE)/repeat_mri.csv', index=0)

before N nans: 2841
after N nans: 0


### CONCATENATE

In [39]:
df_temporal = pd.concat([df_psa[['New ID', 'Days Since Diagnosis']], df_biopsy[['New ID', 'Days Since Diagnosis']], df_mri[['New ID', 'Days Since Diagnosis']]], axis=0)
df_temporal = df_temporal.drop_duplicates(subset=['New ID', 'Days Since Diagnosis'])
df_temporal = df_temporal.sort_values(by=['New ID', 'Days Since Diagnosis']).reset_index(drop=True)

df_temporal = pd.merge(df_temporal, df_psa, how='left', on=['New ID', 'Days Since Diagnosis'])
df_temporal = pd.merge(df_temporal, df_biopsy, how='left', on=['New ID', 'Days Since Diagnosis'])
df_temporal = pd.merge(df_temporal, df_mri, how='left', on=['New ID', 'Days Since Diagnosis'])

count_nans(df_temporal)
df_temporal.to_csv('./data/processed/before imputation/temporal.csv', index=0)

N nans: 74145


In [40]:
### TEMPORAL
id_list = df_temporal['New ID'].unique()
grouped = df_temporal.groupby(by='New ID')

for i, pid in enumerate(id_list):
    tmp = grouped.get_group(pid)
    tmp = tmp.fillna(method='ffill')
    
    if i == 0:
        df_temporal_new = tmp
    else:
        df_temporal_new = pd.concat([df_temporal_new, tmp], axis=0)
df_temporal_new  = df_temporal_new.reset_index(drop=True)

# Debug code:
# count_nans(df_temporal_new, "after ffill")
# indices_where_missing = np.where(df_temporal_new.isnull().to_numpy())
# print(indices_where_missing)
# print(indices_where_missing[0].sum(), indices_where_missing[1].sum())

feat = 'Repeat PSA'
df_temporal_new.loc[df_temporal_new[feat].isna(), feat] = impute_psa.loc[feat, 'mean']
        
for feat in list(df_biopsy)[1:-1]:
    if feat in ['Repeat Biopsy Primary Gleason', 'Repeat Biopsy Secondary Gleason', 'Repeat Biopsy Grade Group']:
        df_temporal_new.loc[df_temporal_new[feat].isna(), feat] = impute_biopsy.loc[feat, 'mode']
    else:
        df_temporal_new.loc[df_temporal_new[feat].isna(), feat] = impute_biopsy.loc[feat, 'mean']
        
for feat in list(df_mri_new)[1:-1]:
    if feat in ['Repeat MRI Stage', 'Repeat MRI PRECISE Scoring']:
        df_temporal_new.loc[df_temporal_new[feat].isna(), feat] = impute_mri.loc[feat, 'mode']
    else:
        df_temporal_new.loc[df_temporal_new[feat].isna(), feat] = impute_mri.loc[feat, 'mean']

count_nans(df_temporal_new)

# Debug code:
# df_temporal_new
# dcheck = df_temporal_new.copy()
# df_temporal_mean_mode = df_temporal_new.copy()

N nans: 0


In [41]:
df_temporal_new.to_csv('./data/processed/after imputation/temporal.csv', index=0)
if not APPLY_MICE_TO_TEMPORAL:
    df_temporal_new.to_csv('./data/processed/after imputation (MICE)/temporal.csv', index=0)

In [42]:
### TEMPORAL (MICE)

if APPLY_MICE_TO_TEMPORAL:
    
    count_nans(df_temporal, "before")
    
    if FFILL_FIRST_WHEN_APPLY_MICE_TO_TEMPORAL:
        id_list = df_temporal['New ID'].unique()
        grouped = df_temporal.groupby(by='New ID')
        for i, pid in enumerate(id_list):
            tmp = grouped.get_group(pid)
            tmp = tmp.fillna(method='ffill')

            if i == 0:
                df_temporal_new = tmp
            else:
                df_temporal_new = pd.concat([df_temporal_new, tmp], axis=0)
        df_temporal_new  = df_temporal_new.reset_index(drop=True)
    else:
        df_temporal_new = df_temporal_new.copy()
    
    # Debug code:
    # count_nans(df_temporal_new, "after ffill")
    # was_df = df_temporal_new.copy()
    # were_missing = df_temporal_new.isnull()
    # indices_where_missing = np.where(df_temporal_new.isnull().to_numpy())
    # print(indices_where_missing)
    # print(indices_where_missing[0].sum(), indices_where_missing[1].sum())
    
    df_temporal_new = mice_impute(
        df=df_temporal_new, 
        ignore_cols=["New ID"], 
        categorical_feats=[
            'Repeat Biopsy Primary Gleason', 
            'Repeat Biopsy Secondary Gleason', 
            'Repeat Biopsy Grade Group',
            'Repeat MRI Stage', 
            'Repeat MRI PRECISE Scoring'
        ], 
        random_state=0
    )
    count_nans(df_temporal_new, "after")
    
    df_temporal_new.to_csv('./data/processed/after imputation (MICE)/temporal.csv', index=0)
    
    # Debug code:
    # check2 = np.where(~np.isclose(df_temporal_new.to_numpy(), dcheck.to_numpy()))
    # print(check2)
    # print(check2[0].sum(), check2[1].sum())
    # # ---
    # x = were_missing.sum(axis=1) > 0
    # y = ["New ID", "Days Since Diagnosis"] + list(np.array(list(were_missing.columns))[were_missing.sum(axis=0) > 0])
    # print(y)
    # display(df_temporal_new.loc[x, y])
    # display(was_df.loc[x, y])
    # display(df_temporal_mean_mode.loc[x, y])
    # # ---
    # diffs = df_temporal_mean_mode != df_temporal_new
    # print(diffs.sum())

before N nans: 74145
after N nans: 0
