In [55]:
import pandas as pd
import os

# Set the directory where the CSV files are stored
directory_path = 'patient_cohort'

# List of file names
file_names = [
    'central_line_patients',
    'hai_positive_patients',
    'invasive_device_cauti',
    'invasive_device_clabsi',
    'invasive_device_vap',
    'patient_detail',
    'lab_results_cauti',
    'lab_results_clabsi',
    'lab_results_vap',
    'patient_immune_compromised',
    'patient_immunosuppressant_drugs',
    'patient_recent_surgeries',
    'patient_vital_signs',
    'urinary_catheter_patients',
    'ventilator_patients'
]

# Dictionary to store dataframes
dataframes = {}

# Loop to read each file and store it in a DataFrame
for file_name in file_names:
    file_path = os.path.join(directory_path, f"{file_name}.csv")
    dataframes[file_name] = pd.read_csv(file_path)

# Now, you can access each DataFrame using its name. For example:
df_central_line_patients = dataframes['central_line_patients']


In [56]:
import pandas as pd
import numpy as np

# Assume central_line_patients, urinary_catheter_patients, and ventilator_patients are loaded as DataFrames
central_line_patients = dataframes['central_line_patients']
urinary_catheter_patients = dataframes['urinary_catheter_patients']
ventilator_patients = dataframes['ventilator_patients']

# Replace NaN values in 'stay_id' with a placeholder (-1)
central_line_patients['stay_id'].fillna(-1, inplace=True)
urinary_catheter_patients['stay_id'].fillna(-1, inplace=True)
ventilator_patients['stay_id'].fillna(-1, inplace=True)

# Start by making copies of the original DataFrames with only the required columns
central_line_df = central_line_patients[['subject_id', 'hadm_id', 'stay_id']].copy()
urinary_catheter_df = urinary_catheter_patients[['subject_id', 'hadm_id', 'stay_id']].copy()
ventilator_df = ventilator_patients[['subject_id', 'hadm_id', 'stay_id']].copy()

# Add the 'InvasiveDevice' column to each DataFrame
central_line_df['InvasiveDevice'] = 'Central Line'
urinary_catheter_df['InvasiveDevice'] = 'Urinary Catheter'
ventilator_df['InvasiveDevice'] = 'Ventilator'

# Assuming central_line_df, urinary_catheter_df, and ventilator_df are your original DataFrames
dfs = [central_line_df, urinary_catheter_df, ventilator_df]
names = ['Central Line', 'Urinary Catheter', 'Ventilator']

# Initialize an empty DataFrame for the final cohort
final_cohort = pd.DataFrame(columns=['subject_id', 'hadm_id', 'stay_id', 'InvasiveDevice'])

# Loop through each DataFrame to merge
for df, name in zip(dfs, names):
    # Create a temporary DataFrame with the name of the invasive device added
    temp_df = df.copy()
    temp_df['InvasiveDevice'] = name
    
    # Merge with the existing final cohort DataFrame
    if final_cohort.empty:
        final_cohort = temp_df
    else:
        final_cohort = pd.merge(final_cohort, temp_df, on=['subject_id', 'hadm_id', 'stay_id'], how='outer', suffixes=('', f'_{name}'))

# Combine the InvasiveDevice columns into a single column
final_cohort['InvasiveDevice'] = final_cohort.filter(like='InvasiveDevice').apply(lambda x: ', '.join(x.dropna()), axis=1)

# Drop the extra InvasiveDevice columns
final_cohort.drop(columns=final_cohort.filter(like='InvasiveDevice_').columns, inplace=True)

In [15]:
final_cohort.head()

Unnamed: 0,subject_id,hadm_id,stay_id,InvasiveDevice
0,14792425,21700173,31387676.0,Central Line
1,14792425,21700173,31387676.0,Central Line
2,15104738,27512296,35483572.0,"Central Line, Ventilator"
3,13282748,28141617,37907191.0,Central Line
4,13282748,28141617,37907191.0,Central Line


In [16]:
hai_positive_patients_df = dataframes['hai_positive_patients']
invasive_device_cauti_df = dataframes['invasive_device_cauti']
invasive_device_clabsi_df = dataframes['invasive_device_clabsi']
invasive_device_vap_df = dataframes['invasive_device_vap']
patient_detail_df = dataframes['patient_detail']
lab_results_cauti_df = dataframes['lab_results_cauti']
lab_results_clabsi_df = dataframes['lab_results_clabsi']
lab_results_vap_df = dataframes['lab_results_vap']
patient_immune_compromised_df = dataframes['patient_immune_compromised']
patient_immunosuppressant_drugs_df = dataframes['patient_immunosuppressant_drugs']
patient_recent_surgeries_df = dataframes['patient_recent_surgeries']
patient_vital_signs_df = dataframes['patient_vital_signs']

In [52]:
import pandas as pd

def merge_dataframes(dfs_with_names, common_cols):
    # Initialise the final DataFrame with the first DataFrame in the list
    final_df = dfs_with_names[0][1].copy()
    
    # Loop through and merge each subsequent DataFrame
    for name, df in dfs_with_names[1:]:
        join_cols = [col for col in common_cols if col in df.columns]
        suffix = f'_from_{name}'
        final_df = pd.merge(final_df, df, on=join_cols, suffixes=('', suffix), how='left')
        
    return final_df

# Define your lab results DataFrames with names
lab_results_dfs_with_names = [
    ('cauti_lab', lab_results_cauti_df),
    ('clabsi_lab', lab_results_clabsi_df),
    ('vap_lab', lab_results_vap_df)
]

# Merge lab results
common_cols = ['subject_id', 'hadm_id', 'stay_id']
final_lab_results = merge_dataframes(lab_results_dfs_with_names, common_cols)

# Define your invasive devices DataFrames with names
invasive_device_dfs_with_names = [
    ('cauti_device', invasive_device_cauti_df), 
    ('clabsi_device', invasive_device_clabsi_df),
    ('vap_device', invasive_device_vap_df)
]

# Merge invasive devices
final_invasive_device = merge_dataframes(invasive_device_dfs_with_names, common_cols)


In [53]:
# Filter rows where 'long_title' is not empty or not null
filtered_patient_immune_compromised_df = patient_immune_compromised_df[patient_immune_compromised_df['long_title'].notna() & patient_immune_compromised_df['long_title'].ne('')]

# Add a 'Immunocompromised' column, set it to True
filtered_patient_immune_compromised_df['Immunocompromised'] = True

# Merge this filtered dataframe with patient_detail_df on 'subject_id'
merged_patient_detail = pd.merge(patient_detail_df, 
                                 filtered_patient_immune_compromised_df[['subject_id', 'Immunocompromised']], 
                                 on='subject_id', 
                                 how='left')

# Fill NaN values in 'Immunocompromised' column with False
merged_patient_detail['Immunocompromised'].fillna(False, inplace=True)


In [54]:
merged_patient_detail['Immunocompromised'].value_counts()

Immunocompromised
True     450397
False     17927
Name: count, dtype: int64

In [50]:
patient_detail_df = pd.read_csv('patient_cohort/patient_detail.csv')

In [None]:
additional_dfs = [
    ('recent_surgeries', patient_recent_surgeries_df),
    ('vital_signs', patient_vital_signs_df)
]

In [67]:
final_cohort_features = final_cohort.copy()

In [None]:
patient_detail_df
final_invasive_device
final_lab_results
patient_recent_surgeries_df
patient_vital_signs_df
final_cohort_features

In [70]:
# List of all the additional DataFrames to be merged
additional_dfs = [
    ('invasive_device', final_invasive_device), 
    ('lab_results', final_lab_results),
    ('patient_detail', patient_detail_df),
    ('recent_surgeries', patient_recent_surgeries_df),
    ('vital_signs', patient_vital_signs_df)
]


# Iterate over each additional DataFrame to merge it with the final cohort
for i, (name, df) in enumerate(additional_dfs):
    print(f"i: {i}, name: {name}")
    # Determine the common columns to join on
    common_cols = ['subject_id', 'hadm_id', 'stay_id']
    join_cols = [col for col in common_cols if col in df.columns]
    
    # Merge the DataFrame into the final cohort
    suffix = '_from_{}'.format(name)
    final_cohort = pd.merge(final_cohort, df, on=join_cols, suffixes=('_from_cohort', suffix), how='left')

i: 0, name: vital_signs


In [123]:
new_final_cohort = final_cohort.copy()

additional_dfs = [
    ('invasive_device', final_invasive_device), 
    ('lab_results', final_lab_results),
    ('patient_detail', patient_detail_df),
    ('recent_surgeries', patient_recent_surgeries_df),
    ('vital_signs', patient_vital_signs_df)
]

In [130]:
merged_df = pd.merge(final_cohort, final_cohort_labelled, how='left', on=['subject_id', 'InvasiveDevice'])


In [134]:
final_cohort = pd.read_csv('final_cohort.csv')

In [136]:
final_cohort.head()

Unnamed: 0,subject_id,hadm_id,stay_id,InvasiveDevice
0,14792425,21700173,31387676.0,Central Line
1,14792425,21700173,31387676.0,Central Line
2,15104738,27512296,35483572.0,"Central Line, Ventilator"
3,13282748,28141617,37907191.0,Central Line
4,13282748,28141617,37907191.0,Central Line


In [135]:
len(final_cohort)

61155

In [None]:
merged_df = pd.merge(new_final_cohort, final_cohort_labelled[['subject_id', 'hadm_id', 'stay_id', 'HAI_category']],
                     on=['subject_id', 'hadm_id', 'stay_id'], how='left')

merged_df.head()

In [125]:
merged_df.head()

Unnamed: 0,subject_id,hadm_id_x,stay_id_x,InvasiveDevice,hadm_id_y,stay_id_y,HAI_category
0,14792425,21700173,31387676.0,Central Line,21700173,31387676.0,No HAI
1,14792425,21700173,31387676.0,Central Line,21700173,31387676.0,No HAI
2,14792425,21700173,31387676.0,Central Line,24361584,31257950.0,No HAI
3,14792425,21700173,31387676.0,Central Line,24361584,31257950.0,No HAI
4,14792425,21700173,31387676.0,Central Line,28539848,-1.0,No HAI


In [86]:
final_invasive_device_agg = final_invasive_device.groupby(['stay_id']).mean().reset_index()

len(final_invasive_device_agg)

53742

In [87]:
final_lab_results_agg = final_lab_results.groupby(['stay_id']).mean().reset_index()

len(final_lab_results_agg)

39153

In [89]:
patient_detail_df_agg = patient_detail_df.groupby(['subject_id', 'hadm_id']).first().reset_index()

len(patient_detail_df_agg)

44246

In [90]:
patient_recent_surgeries_df_agg = patient_recent_surgeries_df.groupby(['subject_id', 'hadm_id', 'stay_id']).mean().reset_index()

len(patient_recent_surgeries_df_agg)

11399

In [91]:
patient_vital_signs_df_agg = patient_vital_signs_df.groupby(['subject_id', 'hadm_id', 'stay_id']).mean().reset_index()

len(patient_vital_signs_df_agg)

29604

In [92]:
from pandas.api.types import is_numeric_dtype

# List to hold aggregated DataFrames
aggregated_dfs = [
    ('invasive_device', final_invasive_device_agg), 
    ('lab_results', final_lab_results_agg),
    ('patient_detail', patient_detail_df_agg),
    ('recent_surgeries', patient_recent_surgeries_df_agg),
    ('vital_signs', patient_vital_signs_df_agg)
]

# Now merge the aggregated DataFrames into final_cohort
for i, (name, df) in enumerate(aggregated_dfs):
    print(f"i: {i}, name: {name}")
    # Determine the common columns to join on
    join_cols = [col for col in common_cols if col in df.columns]
    
    # Merge the DataFrame into the final cohort
    suffix = f'_from_{name}'
    new_final_cohort = pd.merge(new_final_cohort, df, on=join_cols, suffixes=('_from_cohort', suffix), how='left')


i: 0, name: invasive_device
i: 1, name: lab_results
i: 2, name: patient_detail
i: 3, name: recent_surgeries
i: 4, name: vital_signs


In [95]:
new_final_cohort.to_csv('final_cohort_features.csv', index=False)

In [98]:
final_cohort_labelled = pd.read_csv('final_cohort_labled.csv')

In [99]:
final_cohort_labelled.head(1)

Unnamed: 0,subject_id,hadm_id,stay_id,InvasiveDevice,HAI_category
0,14792425,21700173,31387676.0,Central Line,No HAI


In [109]:
# Define the common columns to join on
common_cols = ['subject_id', 'hadm_id', 'InvasiveDevice']

# Merge the DataFrames
merged_df = pd.merge(new_final_cohort, final_cohort_labelled, on=common_cols)




In [110]:
len(merged_df)

176248

In [108]:
new_final_cohort.head(5)

Unnamed: 0,subject_id,hadm_id,stay_id,InvasiveDevice,value,duration_from_cohort,type_condom_cath,type_foley,type_ileoconduit,type_l_nephrostomy,...,maxtemperature,stddevtemperature,variancetemperature,percentile_cont_0_25temperature,percentile_cont_0_75temperature,counttemperature,time_with_Hyperthermia,time_with_Hypothermia,episodes_of_Hyperthermia,episodes_of_Hypothermia
0,14792425,21700173,31387676.0,Central Line,15538.0,345600.0,0.0,1.0,0.0,0.0,...,98.6,29.931844,895.915282,37.1,98.0,40.0,69840.0,0.0,15.0,0.0
1,14792425,21700173,31387676.0,Central Line,15538.0,345600.0,0.0,1.0,0.0,0.0,...,98.6,29.931844,895.915282,37.1,98.0,40.0,69840.0,0.0,15.0,0.0
2,15104738,27512296,35483572.0,"Central Line, Ventilator",13460.0,817200.0,0.0,1.0,0.0,0.0,...,101.1,30.526591,931.872742,37.0,99.2,93.0,177900.0,0.0,56.0,0.0
3,13282748,28141617,37907191.0,Central Line,,,,,,,...,99.0,0.414651,0.171935,97.6,98.2,63.0,310380.0,0.0,63.0,0.0
4,13282748,28141617,37907191.0,Central Line,,,,,,,...,99.0,0.414651,0.171935,97.6,98.2,63.0,310380.0,0.0,63.0,0.0


In [112]:
final_cohort_labelled.head()

63036

In [105]:
print(len(new_final_cohort))
print(len(final_cohort_labelled))

61155
63036


In [115]:
merged_df = pd.merge(new_final_cohort, final_cohort_labelled, how='left', on=['subject_id', 'InvasiveDevice'])

In [120]:
final_cohort_features_labelled = pd.read_csv('final_cohort_features_label.csv')

In [127]:
merged_df = pd.merge(new_final_cohort, final_cohort_labelled[['subject_id', 'hadm_id', 'stay_id', 'HAI_category']],
                     on=['subject_id', 'hadm_id', 'stay_id'], how='left')

merged_df.head()

Unnamed: 0,subject_id,hadm_id,stay_id,InvasiveDevice,HAI_category
0,14792425,21700173,31387676.0,Central Line,No HAI
1,14792425,21700173,31387676.0,Central Line,No HAI
2,14792425,21700173,31387676.0,Central Line,No HAI
3,14792425,21700173,31387676.0,Central Line,No HAI
4,15104738,27512296,35483572.0,"Central Line, Ventilator",No HAI


In [128]:
merged_df['HAI_category'].value_counts()

HAI_category
No HAI    76245
HAP       23430
CAUTI      9187
CLABSI     4020
Name: count, dtype: int64

In [129]:
len(merged_df)

112882

In [138]:
result_df = pd.read_csv('result_df_features.csv')

  result_df = pd.read_csv('result_df_features.csv')


In [142]:
result_df['HAI_category'].fillna('No HAI', inplace=True)

In [143]:
result_df.to_csv('patient_cohort_features.csv', index=False)

In [121]:
len(final_cohort_features_labelled)

98096