In [1]:
import numpy as np
import pandas as pd

In [2]:
# Function that returns number of rows and count of unique PatientIDs for a dataframe. 
def row_ID(dataframe):
    row = dataframe.shape[0]
    ID = dataframe['PatientID'].nunique()
    return row, ID

1. Load the full cohort previously defined

In [4]:
cohort = pd.read_csv('../checkpoint_trial/full_cohort.csv')

In [4]:
cohort.sample(3)

Unnamed: 0,PatientID,LineName,StartDate
6180,F021711E6C907,chemo,2015-04-16
5,FC0B515A8EBD0,Pembrolizumab,2018-12-13
4650,F0CEBA43D13D1,chemo,2016-12-14


In [5]:
cohort_IDs = cohort['PatientID'].to_numpy()

In [6]:
len(cohort_IDs)

6461

2. Load the demographics file and clean it

In [6]:
demographics = pd.read_csv('../data/Demographics.csv')

In [7]:
demographics.sample(3)

Unnamed: 0,PatientID,BirthYear,Gender,Race,Ethnicity,State
738,F5F294BB93141,1938,M,White,Not Hispanic or Latino,
8459,F2B67BE6FF0B4,1954,M,White,,NH
2381,F7386E00A9758,1953,M,White,Not Hispanic or Latino,


In [11]:
#Filter for the patients in the cohort
demographics = demographics[demographics['PatientID'].isin(cohort_IDs)]

In [12]:
row_ID(demographics)

(6461, 6461)

In [13]:
demographics.sample(3)

Unnamed: 0,PatientID,BirthYear,Gender,Race,Ethnicity,State
12827,FCC3236621571,1934,M,Other Race,Not Hispanic or Latino,VA
8746,FC9D7C14923AA,1953,M,,,MI
1247,FD54F9540075F,1939,M,Black or African American,Not Hispanic or Latino,


Race:
The recommendation from Flatiron is to do the following:
This approach specifically addresses the nuance of “Hispanic or Latino” appearing as both a Race and Ethnicity value in Flatiron data, as detailed in the Race and Ethnicity Overview. In order to align with OMB Standards, Flatiron recommends treating “Hispanic or Latino” as an ethnicity, using the following logic:

-Identify patients with a Race value of “Hispanic or Latino”
-For these patients, recode Race to NULL and Ethnicity to “Hispanic or Latino”

The resulting dataset will remove all instances of “Hispanic or Latino” as a Race, leaving “White,” “Black or African American,” “Asian,” “Other Race,” and NULL as possible Race values. 

In [16]:
demographics['Race'].value_counts()

Race
White                        4561
Other Race                    822
Black or African American     306
Asian                          86
Hispanic or Latino              8
Name: count, dtype: int64

In [14]:
# If race value is 'Hispanic or Latino', code as NULL, otherwise value unchanged.
demographics['race'] = (
    np.where(demographics['Race'] == 'Hispanic or Latino', 'NULL', demographics['Race'])
)

In [13]:
# Missing race value will be recoded as NULL
demographics['race'] = demographics['race'].fillna('NULL')

In [15]:
demographics['race'].value_counts().sum()

np.int64(5783)

In [17]:
demographics['race'].value_counts()

race
White                        4561
Other Race                    822
Black or African American     306
Asian                          86
NULL                            8
Name: count, dtype: int64

Ethnicity:

In [18]:
demographics['Ethnicity'].value_counts()

Ethnicity
Not Hispanic or Latino    4896
Hispanic or Latino         246
Name: count, dtype: int64

In [19]:
# If race value is equal to 'Hispanic or Latino', code ethnicity as 'Hispanic or Latino', otherwise unchanged. 
demographics['ethnicity'] = (
    np.where(demographics['Race'] == 'Hispanic or Latino', 'hispanic_latino', demographics['Ethnicity'])
)

In [21]:
demographics['ethnicity'] = demographics['ethnicity'].fillna('NULL')

In [22]:
#recode into snake case
demographics['ethnicity'] = demographics['ethnicity'].replace({'Hispanic or Latino': 'hispanic_latino'})

In [23]:
#recode into snake case
demographics['ethnicity'] = demographics['ethnicity'].replace({'Not Hispanic or Latino': 'not_hispanic_latino'})

In [24]:
demographics['ethnicity'].value_counts()

ethnicity
not_hispanic_latino    4896
NULL                   1318
hispanic_latino         247
Name: count, dtype: int64

In [25]:
#delete race and ethnicity columns
demographics = demographics.drop(columns = ['Race', 'Ethnicity'])

In [26]:
demographics.sample(3)

Unnamed: 0,PatientID,BirthYear,Gender,State,race,ethnicity
8528,F61A21E951E87,1946,M,MI,White,not_hispanic_latino
10221,FB9A9E2A69639,1943,M,NY,White,not_hispanic_latino
6122,F84293B485BD1,1937,M,FL,,


Per Flatiron, it is recommended that race and ethnicity are combined into a single variable as follows: 
-Hispanic or Latino 
-Not Hispanic or Latino, White 
-Not Hispanic or Latino, Black or African American 
-Not Hispanic or Latino, Asian 
-Not Hispanic or Latino, Other Race 
-Not Hispanic or Latino, Unknown Race 
-Unknown  
But will defer creating this column, given that there is some complexity in deciding how to handle cases where either race or ethnicity is unknown, will defer for now, given that race is unlikely to be central to the question at hand.

Birthyear, convert into Age; use the date of first line start (index date for this study) to calculate age

In [27]:
enhanced_adv = pd.read_csv('../data/Enhanced_AdvUrothelial.csv')

In [28]:
demographics = pd.merge(demographics, cohort[['PatientID', 'StartDate']], on = 'PatientID')

In [29]:
demographics.sample(3)

Unnamed: 0,PatientID,BirthYear,Gender,State,race,ethnicity,StartDate
4385,FC71446E3C667,1948,M,MS,White,,2023-02-23
5885,FBE4A050AC7A9,1951,M,TN,White,not_hispanic_latino,2023-08-11
5737,FF7B3C32C5799,1940,M,TN,White,not_hispanic_latino,2011-02-08


In [32]:
#assess data type for StartDate (is it a date)
print(demographics['StartDate'].dtype)

object


In [33]:
#how many entries are missing values
print(demographics['StartDate'].isnull().sum())

0


In [34]:
#what unique types of data are in the StartDate
print(demographics['StartDate'].apply(type).unique())

[<class 'str'>]


In [36]:
#convert StartDate to a date variable
demographics['StartDate'] = pd.to_datetime(demographics['StartDate'], format="%Y-%m-%d")

In [37]:
print(demographics['StartDate'].dtype)

datetime64[ns]


In [38]:
demographics.loc[:, 'age'] = demographics['StartDate'].dt.year - demographics['BirthYear']

In [39]:
demographics.sample(3)

Unnamed: 0,PatientID,BirthYear,Gender,State,race,ethnicity,StartDate,age
15,FF4A5896815CF,1937,M,,White,not_hispanic_latino,2014-11-14,77
3368,FB85C7A701047,1935,F,FL,White,not_hispanic_latino,2019-12-17,84
1838,F9F28422A326F,1933,M,AZ,White,not_hispanic_latino,2012-10-18,79


In [40]:
demographics = demographics.drop(columns = ['BirthYear', 'StartDate'])

In [41]:
demographics.sample(3)

Unnamed: 0,PatientID,Gender,State,race,ethnicity,age
3893,FF20AD67C25AD,F,IN,White,not_hispanic_latino,75
4669,F6F1C0B9CF8A3,F,NJ,White,not_hispanic_latino,78
1384,F1A67DA25872C,M,AL,White,,53


Practice type

In [42]:
practice = pd.read_csv('../data/Practice.csv')

In [44]:
#filter for patients in the cohort
practice = practice[practice['PatientID'].isin(cohort_IDs)]

In [45]:
row_ID(practice)

(7036, 6461)

In [46]:
practice_counts = practice['PracticeType'].value_counts()
print(practice_counts)

PracticeType
COMMUNITY    5751
ACADEMIC     1285
Name: count, dtype: int64


In cases where patients have multiple responses for PracticeType, need to address this by labeling as "BOTH"

In [57]:
#First determine how many practice types are present for each patient
practice_unique_count = (
    practice.groupby('PatientID')['PracticeType'].agg('nunique')
    .to_frame()
    .reset_index()
    .rename(columns = {'PracticeType': 'n_type'})
)

In [63]:
practice_n = pd.merge(practice, practice_unique_count, on = 'PatientID')

In [64]:
#Label patients with more than practice type with "BOTH"
practice_n['practice_type'] = (
    np.where(practice_n['n_type'] == 1, practice_n['PracticeType'], 'BOTH')
)

In [65]:
practice_n = (
    practice_n.drop_duplicates(subset = ['PatientID'], keep = 'first')
    .filter(items = ['PatientID', 'practice_type'])
)

In [66]:
row_ID(practice_n)

(6461, 6461)

In [67]:
practice_n.sample(3)

Unnamed: 0,PatientID,practice_type
4268,F17431238E43F,COMMUNITY
3393,F5F2FEA44D00D,COMMUNITY
5477,F2F49C3740210,ACADEMIC


In [69]:
practice_n['practice_type'].value_counts()

practice_type
COMMUNITY    5179
ACADEMIC      816
BOTH          466
Name: count, dtype: int64

In [70]:
demographics = pd.merge(demographics, practice_n, on = 'PatientID')

In [74]:
demographics.sample(3)

Unnamed: 0,PatientID,Gender,State,race,ethnicity,age,p_type,practice_type
2890,F37F3686431FA,M,FL,White,,79,COMMUNITY,COMMUNITY
4336,FA33C4E7930B8,F,IL,White,not_hispanic_latino,84,COMMUNITY,COMMUNITY
4670,F226BADFF2433,M,NJ,,,74,COMMUNITY,COMMUNITY


Gender:

In [75]:
demographics['Gender'].value_counts()

Gender
M       4726
F       1733
NULL       2
Name: count, dtype: int64

Missing values noted, but will avoid imputation for now, with plans to address missingness in ultimate step; instead label as 'NULL'

In [76]:
demographics['Gender'] = demographics['Gender'].fillna('NULL')
demographics['Gender'].value_counts()

Gender
M       4726
F       1733
NULL       2
Name: count, dtype: int64

In [77]:
#Convert column name to snake
demographics = demographics.rename(columns = {'Gender': 'gender'})

In [None]:
demographics['Gender'].valuecounts()

State:

In [81]:
# Group states into Census-Bureau regions  
state_dict = { 
    'ME': 'northeast', 
    'NH': 'northeast',
    'VT': 'northeast', 
    'MA': 'northeast',
    'CT': 'northeast',
    'RI': 'northeast',  
    'NY': 'northeast', 
    'NJ': 'northeast', 
    'PA': 'northeast', 
    'IL': 'midwest', 
    'IN': 'midwest', 
    'MI': 'midwest', 
    'OH': 'midwest', 
    'WI': 'midwest',
    'IA': 'midwest',
    'KS': 'midwest',
    'MN': 'midwest',
    'MO': 'midwest', 
    'NE': 'midwest',
    'ND': 'midwest',
    'SD': 'midwest',
    'DE': 'south',
    'FL': 'south',
    'GA': 'south',
    'MD': 'south',
    'NC': 'south', 
    'SC': 'south',
    'VA': 'south',
    'DC': 'south',
    'WV': 'south',
    'AL': 'south',
    'KY': 'south',
    'MS': 'south',
    'TN': 'south',
    'AR': 'south',
    'LA': 'south',
    'OK': 'south',
    'TX': 'south',
    'AZ': 'west',
    'CO': 'west',
    'ID': 'west',
    'MT': 'west',
    'NV': 'west',
    'NM': 'west',
    'UT': 'west',
    'WY': 'west',
    'AK': 'west',
    'CA': 'west',
    'HI': 'west',
    'OR': 'west',
    'WA': 'west',
    'PR': 'unknown'
}

demographics['region'] = demographics['State'].map(state_dict)

In [82]:
demographics['region'] = demographics['region'].fillna('unknown')

In [84]:
demographics['region'].value_counts()

region
south        2578
unknown      1523
west          809
northeast     799
midwest       752
Name: count, dtype: int64

In [85]:
demographics = demographics.drop(columns = ['State'])

In [86]:
demographics.sample(3)

Unnamed: 0,PatientID,gender,race,ethnicity,age,p_type,practice_type,region
4048,F8F574106E028,M,White,,60,COMMUNITY,COMMUNITY,northeast
1235,F7C9DBF9EAAB9,M,White,not_hispanic_latino,82,ACADEMIC,ACADEMIC,unknown
2083,F9C973D367070,M,White,not_hispanic_latino,82,COMMUNITY,COMMUNITY,west


In [87]:
%whos DataFrame

Variable                Type         Data/Info
----------------------------------------------
cohort                  DataFrame              PatientID      <...>\n[6461 rows x 3 columns]
demographics            DataFrame              PatientID gende<...>\n[6461 rows x 8 columns]
enhanced_adv            DataFrame               PatientID Diag<...>[13129 rows x 13 columns]
practice                DataFrame               PatientID     <...>\n[7036 rows x 4 columns]
practice_n              DataFrame              PatientID pract<...>\n[6461 rows x 2 columns]
practice_unique_count   DataFrame              PatientID  n_ty<...>\n[6461 rows x 2 columns]


In [88]:
#Keep cohort, demographics, enhanced_adv
del practice
del practice_n
del practice_unique_count

3. Clean enhanced_adv dataset

In [89]:
row_ID(enhanced_adv)

(13129, 13129)

In [90]:
#filter for patients in the cohort
enhanced_adv = enhanced_adv[enhanced_adv['PatientID'].isin(cohort_IDs)]

In [91]:
row_ID(enhanced_adv)

(6461, 6461)

In [92]:
enhanced_adv.sample(3)

Unnamed: 0,PatientID,DiagnosisDate,AdvancedDiagnosisDate,PrimarySite,DiseaseGrade,GroupStage,TStage,NStage,MStage,SmokingStatus,Surgery,SurgeryDate,SurgeryType
7533,FF46D611EA47A,2016-06-22,2016-06-22,Bladder,High grade (G2/G3/G4),Unknown/not documented,Unknown/not documented,Unknown/not documented,Unknown/not documented,History of smoking,False,,
11133,F97FEC1F3F6FB,2020-09-25,2020-09-25,Renal Pelvis,Unknown/not documented,Stage IV,Unknown/not documented,Unknown/not documented,M1,History of smoking,False,,
4509,F64C0B0E8CCE4,2014-09-26,2014-09-26,Bladder,High grade (G2/G3/G4),Stage IV,T2,N0,M0,History of smoking,False,,


GroupStage

In [93]:
stage_counts = enhanced_adv['GroupStage'].value_counts()
print(stage_counts)

GroupStage
Unknown/not documented    2978
Stage IV                  2055
Stage II                   459
Stage III                  263
Stage IVB                  172
Stage IIIA                 164
Stage IVA                  119
Stage I                    113
Stage IIIB                 104
Stage 0is                   20
Stage 0a                    14
Name: count, dtype: int64


In [94]:
# Dictionary for regrouping stages
stage_dict = { 
    'Stage 0': '0',
    'Stage 0is': '0',
    'Stage 0a': '0',
    'Stage I': 'I',
    'Stage II': 'II',
    'Stage III': 'III',
    'Stage IIIA': 'III',
    'Stage IIIB': 'III',
    'Stage IV': 'IV',
    'Stage IVA': 'IV',
    'Stage IVB': 'IV',
    'Unknown/not documented': 'unknown'
}

In [95]:
enhanced_adv['stage'] = enhanced_adv['GroupStage'].map(stage_dict)

In [96]:
stage_counts = enhanced_adv['stage'].value_counts()
print(stage_counts)

stage
unknown    2978
IV         2346
III         531
II          459
I           113
0            34
Name: count, dtype: int64


In [97]:
enhanced_adv = enhanced_adv.drop(columns = ['GroupStage'])

AdvancedDiagnosisDate

In [98]:
#make variable snake case
enhanced_adv = enhanced_adv.rename(columns = {'AdvancedDiagnosisDate': 'adv_diagnosis_date'})

In [99]:
#convert to date
enhanced_adv['adv_diagnosis_date'] = pd.to_datetime(enhanced_adv['adv_diagnosis_date'], format="%Y-%m-%d")

In [100]:
#confirm datetime conversion successful
print(enhanced_adv['adv_diagnosis_date'].dtype)

datetime64[ns]


In [101]:
enhanced_adv.loc[:, 'adv_diagnosis_year'] = enhanced_adv['adv_diagnosis_date'].dt.year

In [102]:
enhanced_adv.sample(3)

Unnamed: 0,PatientID,DiagnosisDate,adv_diagnosis_date,PrimarySite,DiseaseGrade,TStage,NStage,MStage,SmokingStatus,Surgery,SurgeryDate,SurgeryType,stage,adv_diagnosis_year
40,F5EF4C587B798,2021-03-31,2021-03-31,Ureter,High grade (G2/G3/G4),T3,N1,M0,History of smoking,True,2021-06-25,Nephroureterectomy,III,2021
3885,F6845195D41DF,2008-07-14,2015-04-16,Bladder,High grade (G2/G3/G4),Tis,N0,M0,No history of smoking,True,2008-07-14,Complete (radical) cystectomy,unknown,2015
4361,F4DCBE8D70448,2021-11-12,2021-11-12,Bladder,High grade (G2/G3/G4),T2,Unknown/not documented,M1,No history of smoking,False,,,IV,2021


DiagnosisDate

In [103]:
enhanced_adv = enhanced_adv.rename(columns = {'DiagnosisDate': 'diagnosis_date'})

In [104]:
enhanced_adv['diagnosis_date'] = pd.to_datetime(enhanced_adv['diagnosis_date'], format="%Y-%m-%d")

In [105]:
# Missing diagnosis_date will be replaced with adv_date; other dates will be left untouched. 
enhanced_adv['diagnosis_date'] = (
    np.where(enhanced_adv['diagnosis_date'].isna(), enhanced_adv['adv_diagnosis_date'], enhanced_adv['diagnosis_date'])
)

In [106]:
#confirm datetime conversion successful
print(enhanced_adv['adv_diagnosis_date'].dtype)

datetime64[ns]


Time from diagnosis date to advanced date

In [107]:
enhanced_adv.loc[:, 'delta_adv_diagnosis'] = (enhanced_adv['adv_diagnosis_date'] - enhanced_adv['diagnosis_date']).dt.days

In [108]:
enhanced_adv.sample(3)

Unnamed: 0,PatientID,diagnosis_date,adv_diagnosis_date,PrimarySite,DiseaseGrade,TStage,NStage,MStage,SmokingStatus,Surgery,SurgeryDate,SurgeryType,stage,adv_diagnosis_year,delta_adv_diagnosis
11070,F3ACD3B2AB6C8,2014-11-14,2020-10-09,Bladder,High grade (G2/G3/G4),Unknown/not documented,Unknown/not documented,Unknown/not documented,No history of smoking,True,2017-05-19,Cystoprostatectomy,unknown,2020,2156
12400,FB076CDF1B3FD,2017-05-26,2018-09-11,Bladder,High grade (G2/G3/G4),T1,Unknown/not documented,Unknown/not documented,History of smoking,True,2018-09-11,Other,unknown,2018,473
5955,F3E5D35BC3607,2023-04-17,2023-04-17,Renal Pelvis,Unknown/not documented,Unknown/not documented,Unknown/not documented,M1,History of smoking,False,,,IV,2023,0


PrimarySite

In [110]:
enhanced_adv['PrimarySite'].value_counts()

PrimarySite
Bladder         4919
Renal Pelvis     897
Ureter           599
Urethra           46
Name: count, dtype: int64

In [111]:
enhanced_adv = enhanced_adv.rename(columns = {'PrimarySite': 'primary_site'})

In [113]:
enhanced_adv['primary_site'] = enhanced_adv['primary_site'].replace({'Bladder': 'lower_tract'})
enhanced_adv['primary_site'] = enhanced_adv['primary_site'].replace({'Urethra': 'lower_tract'})
enhanced_adv['primary_site'] = enhanced_adv['primary_site'].replace({'Renal Pelvis': 'upper_tract'})
enhanced_adv['primary_site'] = enhanced_adv['primary_site'].replace({'Ureter': 'upper_tract'})
enhanced_adv['primary_site'] = enhanced_adv['primary_site'].fillna('unknown')

In [114]:
site_counts = enhanced_adv['primary_site'].value_counts()
print(site_counts)

primary_site
lower_tract    4965
upper_tract    1496
Name: count, dtype: int64


DiseaseGrade

In [115]:
enhanced_adv['DiseaseGrade'].value_counts()

DiseaseGrade
High grade (G2/G3/G4)     5441
Unknown/not documented     711
Low grade (G1)             309
Name: count, dtype: int64

In [116]:
enhanced_adv = enhanced_adv.rename(columns = {'DiseaseGrade': 'disease_grade'})

In [117]:
enhanced_adv['disease_grade'] = enhanced_adv['disease_grade'].replace({'High grade (G2/G3/G4)': 'high_grade'})
enhanced_adv['disease_grade'] = enhanced_adv['disease_grade'].replace({'Unknown/not documented': 'unknown'})
enhanced_adv['disease_grade'] = enhanced_adv['disease_grade'].replace({'Low grade (G1)': 'low_grade'})

In [118]:
enhanced_adv['disease_grade'].value_counts()

disease_grade
high_grade    5441
unknown        711
low_grade      309
Name: count, dtype: int64

TStage

In [119]:
enhanced_adv['TStage'].value_counts()

TStage
Unknown/not documented    2226
T2                        1140
T3                         738
T1                         720
T4                         295
T4a                        232
T3a                        200
T3b                        196
T2b                        192
T2a                        158
TX                         136
Ta                         126
T4b                         54
Tis                         42
T0                           6
Name: count, dtype: int64

In [120]:
enhanced_adv = enhanced_adv.rename(columns = {'TStage': 't_stage'})

In [121]:
# Dictionary for regrouping t stages
t_stage_dict = { 
    'Unknown/not documented': 'unknown',
    'T2': 'T2',
    'T3': 'T3',
    'T1': 'T1',
    'T4': 'T4',
    'T4a': 'T4',
    'T3a': 'T3',
    'T3b': 'T3',
    'T2b': 'T2',
    'T2a': 'T2',
    'TX': 'unknown',
    'Ta': 'Ta',
    'T4b': 'T4',
    'Tis': 'Tis',
    'T0': 'T0'
    
}

enhanced_adv['t_stage'] = enhanced_adv['t_stage'].map(t_stage_dict)

In [122]:
enhanced_adv['t_stage'].value_counts()

t_stage
unknown    2362
T2         1490
T3         1134
T1          720
T4          581
Ta          126
Tis          42
T0            6
Name: count, dtype: int64

NStage

In [123]:
enhanced_adv['NStage'].value_counts()

NStage
Unknown/not documented    3107
N0                        1315
NX                         743
N2                         612
N1                         497
N3                         187
Name: count, dtype: int64

In [124]:
enhanced_adv = enhanced_adv.rename(columns = {'NStage': 'n_stage'})

In [125]:
# Dictionary for regrouping n stages
n_stage_dict = { 
    'Unknown/not documented': 'unknown',
    'N0': 'N0',
    'NX': 'unknown',
    'N2': 'N2',
    'N1': 'N1',
    'N3': 'N3'
}

enhanced_adv['n_stage'] = enhanced_adv['n_stage'].map(n_stage_dict)

In [126]:
enhanced_adv['n_stage'].value_counts()

n_stage
unknown    3850
N0         1315
N2          612
N1          497
N3          187
Name: count, dtype: int64

MStage

In [127]:
enhanced_adv['MStage'].value_counts()

MStage
M0                        2630
Unknown/not documented    1878
M1                        1330
MX                         393
M1b                        139
M1a                         91
Name: count, dtype: int64

In [128]:
enhanced_adv = enhanced_adv.rename(columns = {'MStage': 'm_stage'})

In [129]:
# Dictionary for regrouping m stages
m_stage_dict = { 
    'M0': 'M0',
    'Unknown/not documented': 'unknown',
    'M1': 'M1',
    'MX': 'unknown',
    'M1b': 'M1',
    'M1a': 'M1'
}

enhanced_adv['m_stage'] = enhanced_adv['m_stage'].map(m_stage_dict)

In [130]:
enhanced_adv['m_stage'].value_counts()

m_stage
M0         2630
unknown    2271
M1         1560
Name: count, dtype: int64

SmokingStatus

In [131]:
enhanced_adv['SmokingStatus'].value_counts()

SmokingStatus
History of smoking        4742
No history of smoking     1677
Unknown/not documented      42
Name: count, dtype: int64

In [132]:
enhanced_adv = enhanced_adv.rename(columns = {'SmokingStatus': 'smoking_status'})

In [133]:
enhanced_adv['smoking_status'] = enhanced_adv['smoking_status'].replace({'History of smoking': 'smoker'})
enhanced_adv['smoking_status'] = enhanced_adv['smoking_status'].replace({'No history of smoking': 'never_smoker'})
enhanced_adv['smoking_status'] = enhanced_adv['smoking_status'].replace({'Unknown/not documented': 'unknown'})

In [134]:
enhanced_adv['smoking_status'].value_counts()

smoking_status
smoker          4742
never_smoker    1677
unknown           42
Name: count, dtype: int64

Surgery

In [135]:
enhanced_adv['Surgery'].value_counts()

Surgery
False    3345
True     3116
Name: count, dtype: int64

In [136]:
enhanced_adv = enhanced_adv.rename(columns = {'Surgery': 'surgery_status'})

In [137]:
enhanced_adv['surgery_status'].value_counts()

surgery_status
False    3345
True     3116
Name: count, dtype: int64

In [138]:
print(enhanced_adv['surgery_status'].dtype)

bool


SurgeryDate

In [139]:
print(enhanced_adv['SurgeryDate'].dtype)

object


In [140]:
enhanced_adv = enhanced_adv.rename(columns = {'SurgeryDate': 'surgery_date'})

In [141]:
enhanced_adv['surgery_date'] = pd.to_datetime(enhanced_adv['surgery_date'], format="%Y-%m-%d")

In [142]:
print(enhanced_adv['surgery_date'].dtype)

datetime64[ns]


Leaving cases where there is no surgery date, empty for now, preserving date formatting

SurgeryType

In [143]:
enhanced_adv['SurgeryType'].value_counts()

SurgeryType
Cystoprostatectomy               1210
Nephroureterectomy                882
Complete (radical) cystectomy     539
Partial cystectomy                126
Nephrectomy                       124
Ureterectomy                       98
Cystectomy, NOS                    73
Other                              58
Unknown/not documented              4
Urethrectomy                        2
Name: count, dtype: int64

In [144]:
enhanced_adv = enhanced_adv.rename(columns = {'SurgeryType': 'surgery_type'})

In [145]:
# Dictionary for regrouping surgery type
surgery_type_dict = { 
    'Cystoprostatectomy': 'cystoprostatectomy',
    'Nephroureterectomy': 'nephroureterectomy',
    'Complete (radical) cystectomy': 'radical_cystectomy',
    'Partial cystectomy': 'partial_cystectomy',
    'Nephrectomy': 'nephrectomy',
    'Ureterectomy': 'ureterectomy',
    'Cystectomy, NOS': 'cystectomy_nos',
    'Other': 'other_surgery',
    'Unknown/not documented': 'unknown_surgery',
    'Urethrectomy': 'urethrectomy'
}

enhanced_adv['surgery_type'] = enhanced_adv['surgery_type'].map(surgery_type_dict)

In [146]:
enhanced_adv['surgery_type'] = enhanced_adv['surgery_type'].fillna('no_surgery')

In [147]:
enhanced_adv['surgery_type'].value_counts()

surgery_type
no_surgery            3345
cystoprostatectomy    1210
nephroureterectomy     882
radical_cystectomy     539
partial_cystectomy     126
nephrectomy            124
ureterectomy            98
cystectomy_nos          73
other_surgery           58
unknown_surgery          4
urethrectomy             2
Name: count, dtype: int64

In [148]:
#Final enhanced_adv dataframe
enhanced_adv.sample(3)

Unnamed: 0,PatientID,diagnosis_date,adv_diagnosis_date,primary_site,disease_grade,t_stage,n_stage,m_stage,smoking_status,surgery_status,surgery_date,surgery_type,stage,adv_diagnosis_year,delta_adv_diagnosis
6701,FE7962AC6ADE3,2003-05-07,2014-09-03,lower_tract,unknown,Tis,N0,unknown,smoker,True,2003-06-27,radical_cystectomy,unknown,2014,4137
7341,F4EA45F24CCA3,2017-11-20,2020-02-04,lower_tract,high_grade,unknown,unknown,unknown,smoker,True,2018-05-03,cystectomy_nos,unknown,2020,806
7308,F0214E30879AC,2020-03-01,2022-10-10,lower_tract,high_grade,T3,N0,M0,smoker,True,2020-06-02,cystoprostatectomy,III,2022,953
