In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the pre-extracted tables in one pass; the path order below matches the
# order of the names on the left-hand side.
# NOTE(review): read_pickle can execute code embedded in the file, so these
# paths should only ever point at trusted, locally produced extracts.
df_all_patients, df_admissions, df_lab_events, df_ckd_lab_items, df_icd = map(
    pd.read_pickle,
    [
        'mimic_iv_extract/df_all_patients.pkl',                 # patients
        'mimic_iv_extract/df_admissions.pkl',                   # hospital admissions
        'mimic_iv_extract/df_lab_events.pkl',                   # lab events
        'mimic_iv_extract/df_ckd_lab_items.pkl',                # CKD-related lab item catalogue
        'mimic_iv_extract/df_icd_codes_with_description.pkl',   # ICD codes with long titles
    ],
)

In [3]:
def calc_eGFR(df, crt_col, sex_col, sex_flg, age_col):
    """
    Vectorized estimate of the Glomerular Filtration Rate (eGFR) with the CKD-EPI creatinine formula.

    eGFR = 142 * min(Scr/k, 1)**alpha * max(Scr/k, 1)**-1.2 * 0.9938**age * sex multiplier

    Parameters:
    - df (pandas.DataFrame): Patient data holding the three columns named below.
    - crt_col (str): Name of the serum creatinine column.
      NOTE(review): the equation expects mg/dL - confirm the unit of the source column.
    - sex_col (str): Name of the sex column.
    - sex_flg: Value in `sex_col` that selects k = 0.7, alpha = -0.241 and the 1.012 multiplier
      (the female constants of the equation); every other value gets k = 0.9, alpha = -0.302
      and a multiplier of 1.0.
    - age_col (str): Name of the age column.
      NOTE(review): the equation expects age in years - confirm.

    Returns:
    - pandas.Series: eGFR for each row of `df`, aligned to the index of `df`.
    """
    # Rows carrying the flagged sex value; evaluated once and reused for all three constants
    is_flagged = df[sex_col] == sex_flg
    kappa = np.where(is_flagged, 0.7, 0.9)
    exponent = np.where(is_flagged, -0.241, -0.302)
    sex_multiplier = np.where(is_flagged, 1.012, 1.0)

    # Creatinine relative to the sex-specific threshold
    ratio = df[crt_col] / kappa

    # Only one of the two terms differs from 1 for any given row
    below_term = np.minimum(ratio, 1) ** exponent
    above_term = np.maximum(ratio, 1) ** -1.2
    age_term = 0.9938 ** df[age_col]

    return 142 * below_term * above_term * age_term * sex_multiplier

In [4]:
def label_CKD_stage(df, egfr_col):
    """
    Assigns a chronic kidney disease stage to each row from its eGFR value.

    Parameters:
    - df (pandas.DataFrame): Dataframe containing the eGFR column.
    - egfr_col (str): Name of the column holding the eGFR values.

    Returns:
    - pandas.Series: CKD stage per row: 1 (eGFR >= 90), 2 (>= 60), 3 (>= 30), 4 (>= 15),
      5 (below 15), or NaN where the eGFR is missing.
    """
    # (lower eGFR bound, stage), ordered from the highest bound down
    stage_floors = ((90, 1), (60, 2), (30, 3), (15, 4))

    def stage_for(value):
        if pd.isna(value):
            return np.nan
        # The first bound the value reaches decides the stage
        for floor, stage in stage_floors:
            if value >= floor:
                return stage
        return 5

    return df[egfr_col].apply(stage_for)

# Helper Function for Converting CKD ICD-9 Codes to Stage Numbers

In [5]:
def convert_CKD_stage(df, icd_col):
    """
    Converts numeric CKD ICD-9 codes (585x) into CKD stage numbers.

    Parameters:
    - df (pandas.DataFrame): Dataframe containing the ICD code column.
    - icd_col (str): Name of the column holding the ICD-9 codes as numbers (e.g. 5853).

    Returns:
    - pandas.Series: One value per row: 1-5 for codes 5851-5855, 6 for 5856
      (end stage renal disease), -1 for 5859 (unspecified), and None for any other value.
    """
    # (ICD-9 code, stage number)
    code_stage_pairs = (
        (5851, 1),
        (5852, 2),
        (5853, 3),
        (5854, 4),
        (5855, 5),
        (5856, 6),
        (5859, -1),
    )

    def stage_from_code(code):
        # Plain == comparison, so float-typed codes such as 5853.0 match as well
        for known_code, stage in code_stage_pairs:
            if code == known_code:
                return stage
        # Anything that is not a known code has no stage
        return None

    return df[icd_col].apply(stage_from_code)

# Construct a Dataframe of Number of Patients and Hospitalisations with each CKD ICD-9 code

In [6]:
# Subset the CKD diagnoses using the ICD-9 prefix 585.
# na=False treats a missing icd_code as "no match"; without it the mask would
# contain NaN and the boolean indexing below would raise.
# NOTE(review): this is a plain prefix match with no ICD-version filter - confirm
# that no non-ICD-9 code in df_icd can start with '585'.
df_icd_585 = df_icd[df_icd['icd_code'].str.startswith('585', na=False)]

# number of distinct patients with each CKD ICD-9 code
df_icd_585_patients = df_icd_585.groupby('icd_code').subject_id.nunique().to_frame().reset_index()

# number of distinct hospital entries with each CKD ICD-9 code
df_icd_585_hosp = df_icd_585.groupby('icd_code').hadm_id.nunique().to_frame().reset_index()

# merge the two tables (columns after the merge: icd_code, subject_id, hadm_id)
df_icd_585_patients_hosp = pd.merge(df_icd_585_patients, df_icd_585_hosp, on='icd_code', how='outer')

# rename columns (positional: matches the column order produced by the merge above)
df_icd_585_patients_hosp.columns = ['icd_code', 'No. of patients', 'No. of hospital entries']

# add description; drop_duplicates keeps one row per distinct (icd_code, long_title) pair
df_icd_585_patients_hosp = pd.merge(df_icd_585_patients_hosp, df_icd_585[['icd_code', 'long_title']].drop_duplicates(), on='icd_code', how='left')

df_icd_585_patients_hosp

Unnamed: 0,icd_code,No. of patients,No. of hospital entries,long_title
0,5851,89,98,"Chronic kidney disease, Stage I"
1,5852,827,1105,"Chronic kidney disease, Stage II (mild)"
2,5853,4312,6996,"Chronic kidney disease, Stage III (moderate)"
3,5854,1587,2836,"Chronic kidney disease, Stage IV (severe)"
4,5855,475,682,"Chronic kidney disease, Stage V"
5,5856,2441,8882,End stage renal disease
6,5859,9032,16983,"Chronic kidney disease, unspecified"


# Construct a Dataframe of Number of Patients and Hospitalisations Associated with each CKD Related Lab Item

In [7]:
# distinct patients per CKD related lab item
df_lab_events_patients = df_lab_events.groupby('itemid')['subject_id'].nunique().reset_index()

# distinct hospital entries per CKD related lab item
df_lab_events_hosp = df_lab_events.groupby('itemid')['hadm_id'].nunique().reset_index()

# combine both counts; the outer join keeps every itemid present in either table
df_lab_events_patients_hosp = df_lab_events_patients.merge(df_lab_events_hosp, on='itemid', how='outer')

# attach the lab item descriptions
df_lab_events_summary = df_lab_events_patients_hosp.merge(df_ckd_lab_items, on='itemid', how='left')

# positional rename of all six columns
# NOTE(review): relies on df_ckd_lab_items contributing exactly label, fluid and
# category (in that order) besides itemid - a different column count raises here.
df_lab_events_summary.columns = ['itemid', 'No. of patients', 'No. of hospital entries', 'label', 'fluid', 'category']

df_lab_events_summary

Unnamed: 0,itemid,No. of patients,No. of hospital entries,label,fluid,category
0,50808,40415,42311,Free_Calcium,Blood,Blood Gas
1,50811,36441,29112,Hemoglobin,Blood,Blood Gas
2,50853,30207,5072,25-OH_Vitamin_D,Blood,Chemistry
3,50861,161076,154178,Alanine_Aminotransferase_(ALT),Blood,Chemistry
4,50862,129769,111433,Albumin,Blood,Chemistry
...,...,...,...,...,...,...
80,51474,5103,5010,Eosinophils,Urine,Hematology
81,51492,188242,126564,Protein,Urine,Hematology
82,51493,147675,94293,RBC,Urine,Hematology
83,51494,92,56,RBC_Casts,Urine,Hematology


# Construct a Dataframe of First & Last CKD Stage Diagnoses For each Patient

In [8]:
# Merge df_icd_585 with df_admissions to get the admittime and dischtime.
# how='left' keeps diagnosis rows without a matching admission; their timestamps are missing (NaT).
df_icd_585_adm = pd.merge(df_icd_585, df_admissions[['subject_id', 'hadm_id', 'admittime', 'dischtime']],
                          on=['subject_id', 'hadm_id'], how='left')

# Ensure that admittime and dischtime are in datetime format
df_icd_585_adm['admittime'] = pd.to_datetime(df_icd_585_adm['admittime'])
df_icd_585_adm['dischtime'] = pd.to_datetime(df_icd_585_adm['dischtime'])

# Identify the first CKD stage: row with the earliest admittime per patient.
# Rows without an admittime are excluded before grouping, so a patient whose
# timestamps are all missing cannot yield a NaN label (or an error) from idxmin
# and break the .loc lookup. dropna keeps the original index labels, so the
# result still addresses rows of df_icd_585_adm.
# NOTE(review): when one admission carries several 585x codes, the row that comes
# first in df_icd_585_adm wins - confirm that this ordering is the intended tie-break.
first_ckd_idx = df_icd_585_adm.dropna(subset=['admittime']).groupby('subject_id')['admittime'].idxmin()
first_ckd_stage = df_icd_585_adm.loc[first_ckd_idx].reset_index(drop=True)

# Identify the last CKD stage: row with the latest dischtime per patient
# (same handling of missing timestamps as above)
last_ckd_idx = df_icd_585_adm.dropna(subset=['dischtime']).groupby('subject_id')['dischtime'].idxmax()
last_ckd_stage = df_icd_585_adm.loc[last_ckd_idx].reset_index(drop=True)

# Rename columns for clarity
first_ckd_stage = first_ckd_stage.rename(columns={'icd_code': 'first_stage_icd', 'long_title': 'first_long_title'})
last_ckd_stage = last_ckd_stage.rename(columns={'icd_code': 'last_stage_icd', 'long_title': 'last_long_title'})

# Merge first and last CKD stages; hadm_id exists on both sides and becomes
# hadm_id_first / hadm_id_last. The inner join keeps only patients present in both tables.
df_first_last_ckd = pd.merge(
    first_ckd_stage[['subject_id', 'first_stage_icd', 'first_long_title', 'hadm_id', 'admittime']],
    last_ckd_stage[['subject_id', 'last_stage_icd', 'last_long_title', 'hadm_id', 'dischtime']],
    on='subject_id', how='inner', suffixes=('_first', '_last')
)

# Cast icd code columns to numeric (values that cannot be parsed become NaN)
df_first_last_ckd['first_stage_icd'] = pd.to_numeric(df_first_last_ckd['first_stage_icd'], errors='coerce')
df_first_last_ckd['last_stage_icd'] = pd.to_numeric(df_first_last_ckd['last_stage_icd'], errors='coerce')

# Convert ICD codes to CKD stages
df_first_last_ckd['CKD_stage_first'] = convert_CKD_stage(df_first_last_ckd, 'first_stage_icd')
df_first_last_ckd['CKD_stage_last'] = convert_CKD_stage(df_first_last_ckd, 'last_stage_icd')

# Display the result
df_first_last_ckd

Unnamed: 0,subject_id,first_stage_icd,first_long_title,hadm_id_first,admittime,last_stage_icd,last_long_title,hadm_id_last,dischtime,CKD_stage_first,CKD_stage_last
0,10000980,5853,"Chronic kidney disease, Stage III (moderate)",29654838,2188-01-03 17:41:00,5854,"Chronic kidney disease, Stage IV (severe)",25242409,2191-04-11 16:21:00,3,4
1,10002013,5859,"Chronic kidney disease, unspecified",24848509,2162-07-08 00:08:00,5859,"Chronic kidney disease, unspecified",24848509,2162-07-09 18:06:00,-1,-1
2,10002155,5859,"Chronic kidney disease, unspecified",23822395,2129-08-04 12:44:00,5859,"Chronic kidney disease, unspecified",20345487,2131-03-10 01:55:00,-1,-1
3,10003400,5853,"Chronic kidney disease, Stage III (moderate)",29483621,2136-11-04 20:43:00,5853,"Chronic kidney disease, Stage III (moderate)",20214994,2137-03-19 15:45:00,3,3
4,10003502,5859,"Chronic kidney disease, unspecified",22491625,2161-06-29 14:34:00,5859,"Chronic kidney disease, unspecified",22491625,2161-07-01 16:56:00,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...
14007,19997367,5859,"Chronic kidney disease, unspecified",21508795,2127-04-02 01:03:00,5859,"Chronic kidney disease, unspecified",24169669,2128-02-26 16:10:00,-1,-1
14008,19997911,5859,"Chronic kidney disease, unspecified",26014121,2195-11-03 19:42:00,5859,"Chronic kidney disease, unspecified",26014121,2195-11-05 12:48:00,-1,-1
14009,19998330,5852,"Chronic kidney disease, Stage II (mild)",27282608,2177-07-25 04:34:00,5853,"Chronic kidney disease, Stage III (moderate)",24096838,2178-12-01 17:10:00,2,3
14010,19998497,5859,"Chronic kidney disease, unspecified",29288061,2139-07-01 16:19:00,5854,"Chronic kidney disease, Stage IV (severe)",21557581,2145-08-01 13:04:00,-1,4


# Save Frame to Excel Sheet

In [12]:
# Work on an independent (deep) copy so the export never aliases df_first_last_ckd
CKD_summary = df_first_last_ckd.copy(deep=True)

# Write the summary to the extract folder, leaving out the row index
CKD_summary.to_excel('mimic_iv_extract/ckd_summary.xlsx', index=False)