# Data Cleaning
**This notebook prepares Flatiron Health CSV files for patients with advanced urothelial cancer treated with first-line checkpoint inhibitors or chemotherapy. Refer to the "defining_cohort" notebook for cohort selection details. Each CSV is cleaned using the flatiron_cleaner package. The cleaned dataframes are then merged into a single dataset, which will serve as the input for unsupervised clustering to identify clinically or biologically meaningful subgroups.**

## Import packages

In [1]:
import numpy as np
import pandas as pd

from flatiron_cleaner import DataProcessorUrothelial
from flatiron_cleaner import merge_dataframes

## Import data

In [2]:
# Load the cohort table (PatientID, LineName, StartDate). StartDate is passed to the
# processors below as each patient's index date.
# NOTE(review): presumably written by the "defining_cohort" notebook — confirm.
df = pd.read_csv('../outputs/full_cohort.csv')

In [3]:
# Preview the first five cohort rows (head() defaults to n=5).
df.head()

Unnamed: 0,PatientID,LineName,StartDate
0,F5AAF96C85477,Pembrolizumab,2021-07-08
1,F788831A66E9A,Pembrolizumab,2023-02-22
2,F75847DF35E43,Atezolizumab,2019-04-25
3,F6E944C1709E6,Pembrolizumab,2020-08-12
4,F75087BE5F959,Pembrolizumab,2020-09-09


In [4]:
# Record the cohort size; the processed tables below are logged with this same row count.
df.shape

(6461, 3)

In [5]:
# Plain Python list of cohort PatientIDs, used to filter the Enhanced file below.
ids = df['PatientID'].to_list()

## Clean CSV files 

In [6]:
# Create one shared processor; every process_* call below goes through this instance.
processor = DataProcessorUrothelial()

In [7]:
# Clean Enhanced_AdvUrothelial.csv, restricted to the cohort's PatientIDs.
enhanced_df = processor.process_enhanced(
    file_path='../data/Enhanced_AdvUrothelial.csv',
    patient_ids=ids,
)

2025-04-09 14:48:11,566 - INFO - Successfully read Enhanced_AdvUrothelial.csv file with shape: (13129, 13) and unique PatientIDs: 13129
2025-04-09 14:48:11,567 - INFO - Filtering for 6461 specific PatientIDs
2025-04-09 14:48:11,570 - INFO - Successfully filtered Enhanced_AdvUrothelial.csv file with shape: (6461, 13) and unique PatientIDs: 6461
2025-04-09 14:48:11,585 - INFO - Successfully processed Enhanced_AdvUrothelial.csv file with final shape: (6461, 13) and unique PatientIDs: 6461


In [8]:
# Clean Demographics.csv; the cohort frame supplies the patients and StartDate as index date.
# NOTE(review): presumably the index date is used to derive age at treatment start — confirm.
demographics_df = processor.process_demographics(
    file_path='../data/Demographics.csv',
    index_date_df=df,
    index_date_column='StartDate',
)

2025-04-09 14:48:11,598 - INFO - Successfully read Demographics.csv file with shape: (13129, 6) and unique PatientIDs: 13129
2025-04-09 14:48:11,610 - INFO - Successfully processed Demographics.csv file with final shape: (6461, 6) and unique PatientIDs: 6461


In [9]:
# Clean Enhanced_AdvUrothelialBiomarkers.csv relative to each patient's StartDate.
# NOTE(review): days_before=None presumably leaves the look-back unbounded, and
# days_after=14 presumably admits results up to 14 days after the index date —
# confirm against the flatiron_cleaner documentation.
biomarkers_df = processor.process_biomarkers(
    file_path='../data/Enhanced_AdvUrothelialBiomarkers.csv',
    index_date_df=df,
    index_date_column='StartDate',
    days_before=None,
    days_after=14,
)

2025-04-09 14:48:11,629 - INFO - Successfully read Enhanced_AdvUrothelialBiomarkers.csv file with shape: (9924, 19) and unique PatientIDs: 4251
2025-04-09 14:48:11,639 - INFO - Successfully merged Enhanced_AdvUrothelialBiomarkers.csv df with index_date_df resulting in shape: (6326, 20) and unique PatientIDs: 2623
2025-04-09 14:48:11,681 - INFO - Successfully processed Enhanced_AdvUrothelialBiomarkers.csv file with final shape: (6461, 4) and unique PatientIDs: 6461


In [10]:
# Inspect the PD-L1 staining categories (missing values included) before collapsing them.
biomarkers_df['PDL1_percent_staining'].value_counts(dropna=False)

PDL1_percent_staining
NaN          6372
5% - 9%        25
10% - 19%      16
20% - 29%      14
30% - 39%       9
90% - 99%       6
1%              5
50% - 59%       3
70% - 79%       3
80% - 89%       3
40% - 49%       2
2% - 4%         1
60% - 69%       1
100%            1
0%              0
< 1%            0
Name: count, dtype: int64

In [11]:
def map_pdl1(value):
    """Collapse a PD-L1 percent-staining category into a binary label.

    Args:
        value: A staining category such as '5% - 9%', or a missing value.

    Returns:
        The input unchanged when it is missing, '0%' for the '0%' and
        '< 1%' categories, and '>=1%' for every other value.
    """
    # Guard clause: propagate missing values untouched.
    if pd.isna(value):
        return value
    return '0%' if value in ('0%', '< 1%') else '>=1%'

# Derive the binary PD-L1 column element-wise from the granular staining categories.
pdl1_staining = biomarkers_df['PDL1_percent_staining']
biomarkers_df['PDL1_binary'] = pdl1_staining.apply(map_pdl1)

In [12]:
# Check the collapsed PD-L1 distribution, missing values included.
biomarkers_df['PDL1_binary'].value_counts(dropna=False)

PDL1_binary
NaN     6372
>=1%      89
Name: count, dtype: int64

In [13]:
# Remove the granular staining column now that PDL1_binary replaces it.
# errors='ignore' keeps this cell idempotent: without it, running the cell a second
# time raises KeyError because the column is already gone.
biomarkers_df = biomarkers_df.drop(columns=['PDL1_percent_staining'], errors='ignore')

In [14]:
# Clean ECOG.csv relative to each patient's StartDate.
# The logged shapes show one row per cohort patient is returned even when fewer
# patients have ECOG records in the merge.
# NOTE(review): presumably days_before/days_after bound the baseline ECOG window and
# days_before_further is an extended look-back — confirm against flatiron_cleaner.
ecog_df = processor.process_ecog(
    file_path='../data/ECOG.csv',
    index_date_df=df,
    index_date_column='StartDate',
    days_before=90,
    days_after=0,
    days_before_further=180,
)

2025-04-09 14:48:11,762 - INFO - Successfully read ECOG.csv file with shape: (184794, 4) and unique PatientIDs: 9933
2025-04-09 14:48:11,800 - INFO - Successfully merged ECOG.csv df with index_date_df resulting in shape: (118838, 5) and unique PatientIDs: 5453
2025-04-09 14:48:11,885 - INFO - Successfully processed ECOG.csv file with final shape: (6461, 3) and unique PatientIDs: 6461


In [15]:
# Clean Vitals.csv relative to each patient's StartDate.
# NOTE(review): presumably weight_days_before/days_after bound the baseline weight
# window, vital_summary_lookback is the span summarised for other vitals, and
# abnormal_reading_threshold is the number of abnormal readings needed to flag a
# patient — confirm against flatiron_cleaner.
vitals_df = processor.process_vitals(
    file_path='../data/Vitals.csv',
    index_date_df=df,
    index_date_column='StartDate',
    weight_days_before=90,
    days_after=0,
    vital_summary_lookback=180,
    abnormal_reading_threshold=1,
)

2025-04-09 14:48:15,405 - INFO - Successfully read Vitals.csv file with shape: (3604484, 16) and unique PatientIDs: 13109
2025-04-09 14:48:17,007 - INFO - Successfully merged Vitals.csv df with index_date_df resulting in shape: (2038026, 17) and unique PatientIDs: 6461
2025-04-09 14:48:17,889 - INFO - Successfully processed Vitals.csv file with final shape: (6461, 8) and unique PatientIDs: 6461


In [16]:
# Clean Lab.csv relative to each patient's StartDate.
# NOTE(review): presumably days_before/days_after bound the baseline lab window and
# summary_lookback is the span used for summary statistics — confirm against
# flatiron_cleaner.
labs_df = processor.process_labs(
    file_path='../data/Lab.csv',
    index_date_df=df,
    index_date_column='StartDate',
    days_before=90,
    days_after=0,
    summary_lookback=180,
)

2025-04-09 14:48:30,082 - INFO - Successfully read Lab.csv file with shape: (9373598, 17) and unique PatientIDs: 12700
2025-04-09 14:48:33,315 - INFO - Successfully merged Lab.csv df with index_date_df resulting in shape: (5615579, 18) and unique PatientIDs: 6408
2025-04-09 14:48:45,296 - INFO - Successfully processed Lab.csv file with final shape: (6461, 76) and unique PatientIDs: 6461


In [17]:
# Clean MedicationAdministration.csv relative to each patient's StartDate.
# NOTE(review): presumably only administrations in the 90 days up to the index date
# are considered — confirm against flatiron_cleaner.
medications_df = processor.process_medications(
    file_path='../data/MedicationAdministration.csv',
    index_date_df=df,
    index_date_column='StartDate',
    days_before=90,
    days_after=0,
)

2025-04-09 14:48:46,671 - INFO - Successfully read MedicationAdministration.csv file with shape: (997836, 11) and unique PatientIDs: 10983
2025-04-09 14:48:47,041 - INFO - Successfully merged MedicationAdministration.csv df with index_date_df resulting in shape: (565555, 12) and unique PatientIDs: 6341
2025-04-09 14:48:47,090 - INFO - Successfully processed MedicationAdministration.csv file with final shape: (6461, 9) and unique PatientIDs: 6461


In [18]:
# Clean Diagnosis.csv relative to each patient's StartDate.
# NOTE(review): days_before=None presumably leaves the look-back unbounded, so all
# diagnoses up to the index date are considered — confirm against flatiron_cleaner.
diagnosis_df = processor.process_diagnosis(
    file_path='../data/Diagnosis.csv',
    index_date_df=df,
    index_date_column='StartDate',
    days_before=None,
    days_after=0,
)

2025-04-09 14:48:47,510 - INFO - Successfully read Diagnosis.csv file with shape: (625348, 6) and unique PatientIDs: 13129
2025-04-09 14:48:47,631 - INFO - Successfully merged Diagnosis.csv df with index_date_df resulting in shape: (309101, 7) and unique PatientIDs: 6461
2025-04-09 14:48:48,649 - INFO - Successfully processed Diagnosis.csv file with final shape: (6461, 40) and unique PatientIDs: 6461


## Merge dataframes

In [19]:
# Combine the eight cleaned tables into a single patient-level frame.
# NOTE(review): merge_dataframes appears to join on PatientID (its log reports unique
# PatientIDs after each merge) — confirm against flatiron_cleaner.
final_df = merge_dataframes(
    enhanced_df,
    demographics_df,
    biomarkers_df,
    ecog_df,
    vitals_df,
    labs_df,
    medications_df,
    diagnosis_df,
)

# Fail fast if the merge duplicated or dropped patients, rather than relying on
# someone reading the log output.
assert final_df['PatientID'].is_unique, "merge produced duplicate PatientID rows"
assert set(final_df['PatientID']) == set(ids), "merged patients differ from the cohort"

2025-04-09 14:48:48,660 - INFO - Anticipated number of merges: 7
2025-04-09 14:48:48,660 - INFO - Anticipated number of columns in final dataframe presuming all columns are unique except for PatientID: 152
2025-04-09 14:48:48,663 - INFO - Dataset 1 shape: (6461, 13), unique PatientIDs: 6461
2025-04-09 14:48:48,665 - INFO - Dataset 2 shape: (6461, 6), unique PatientIDs: 6461
2025-04-09 14:48:48,666 - INFO - Dataset 3 shape: (6461, 4), unique PatientIDs: 6461
2025-04-09 14:48:48,669 - INFO - Dataset 4 shape: (6461, 3), unique PatientIDs: 6461
2025-04-09 14:48:48,671 - INFO - Dataset 5 shape: (6461, 8), unique PatientIDs: 6461
2025-04-09 14:48:48,671 - INFO - Dataset 6 shape: (6461, 76), unique PatientIDs: 6461
2025-04-09 14:48:48,672 - INFO - Dataset 7 shape: (6461, 9), unique PatientIDs: 6461
2025-04-09 14:48:48,673 - INFO - Dataset 8 shape: (6461, 40), unique PatientIDs: 6461
2025-04-09 14:48:48,679 - INFO - After merge 1 shape: (6461, 18), unique PatientIDs 6461
2025-04-09 14:48:48,68

In [20]:
# Check the merged shape against the column count anticipated in the merge log.
final_df.shape

(6461, 152)

In [21]:
# Preview the first two merged rows.
final_df.head(2)

Unnamed: 0,PatientID,PrimarySite,DiseaseGrade,SmokingStatus,Surgery,GroupStage_mod,TStage_mod,NStage_mod,MStage_mod,SurgeryType_mod,...,van_walraven_score,lymph_met,thoracic_met,liver_met,bone_met,brain_met,adrenal_met,peritoneum_met,gi_met,other_met
0,F0016E985D839,Renal Pelvis,High grade (G2/G3/G4),History of smoking,1,IV,T3,N1,M0,upper,...,4.0,0,0,0,0,0,0,0,0,0
1,F001E5D4C6FA0,Bladder,Low grade (G1),History of smoking,1,unknown,T1,unknown,unknown,bladder,...,,0,0,0,0,0,0,0,0,0


## Export dataframe

In [22]:
# Write the merged dataset; index = False omits the positional index from the file.
final_df.to_csv('../outputs/final_df.csv', index = False)

In [23]:
# Save each column's dtype name next to the data, since a CSV does not carry dtypes.
dtype_names = final_df.dtypes.apply(lambda dtype: dtype.name)
dtype_names.to_csv('../outputs/final_df_dtypes.csv')