# Import Necessary Headers

In [1]:
import pandas as pd
import numpy as np

# Load Data Sources into Pandas Dataframes

In [2]:
# Hong et al. data set
df_hong = pd.read_csv("hong_et_al/df.csv")

In [3]:
# MIMIC-IV data extracts necessary for egfr calculation
df_all_patients = pd.read_pickle('mimic_iv_extract/df_all_patients.pkl')
df_lab_events = pd.read_pickle('mimic_iv_extract/df_lab_events.pkl')
df_ckd_lab_items = pd.read_pickle('mimic_iv_extract/df_ckd_lab_items.pkl')

# Helper Function for eGFR Calculation

In [4]:
def calc_eGFR(df, crt_col, sex_col, sex_flg, age_col):
    """
    Vectorized calculation approach to estimated Glomerular Filtration Rate (eGFR) using the CKD-EPI formula.
    
    Parameters:
    - df (pandas.DataFrame): Dataframe with patient data including 'gender', 'age', and 'creatinine_last'.
    
    Returns:
    - pandas.Series: A series representing the eGFR for each patient.
    """

    # Calculate numpy series for constants
    k = np.where(df[sex_col] == sex_flg, 0.7, 0.9)
    alpha = np.where(df[sex_col] == sex_flg, -0.241, -0.302)
    gender_scalar = np.where(df[sex_col] == sex_flg, 1.012, 1.0)
    
    # Calculate pandas series for Serum Creatine / k and min and max components
    scr_k_ratio = df[crt_col] / k
    min_scr_k = np.minimum(scr_k_ratio, 1)
    max_scr_k = np.maximum(scr_k_ratio, 1)
    
    # Calculate eGFR via vectorized operation
    eGFR = 142 * (min_scr_k ** alpha) * (max_scr_k ** -1.2) * (0.9938 ** df[age_col]) * gender_scalar
    
    return eGFR

# Helper Function for CKD Stage Labeling

In [5]:
def label_CKD_stage(df, egfr_col):
    """
    Labels the stage of chronic kidney disease based on patient eGFR levels.

    Parameters:
    - egfr_srs (pandas.Series): A pandas Series containing eGFR levels of patients.

    Returns:
    - pandas.Series: A series representing the CKD stage (1-5) for each patient.
    """
    def label_patient(egfr):
        if pd.isna(egfr):
            return np.nan
        elif egfr >= 90:
            return 1
        elif egfr >= 60:
            return 2
        elif egfr >= 30:
            return 3
        elif egfr >= 15:
            return 4
        else:
            return 5

    return df[egfr_col].apply(label_patient)

# Initial View of Hong et al. Dataset

In [6]:
# Initial view of the Hong dataframe
df_hong.head(3)

Unnamed: 0,dep_name,esi,age,gender,ethnicity,race,lang,religion,maritalstatus,employstatus,...,cc_vaginaldischarge,cc_vaginalpain,cc_weakness,cc_wheezing,cc_withdrawal-alcohol,cc_woundcheck,cc_woundinfection,cc_woundre-evaluation,cc_wristinjury,cc_wristpain
0,B,4.0,40.0,Male,Hispanic or Latino,White or Caucasian,English,,Single,Full Time,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,B,4.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Pentecostal,Married,Not Employed,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,B,2.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Pentecostal,Married,Not Employed,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Insert New eGFR and CKD Stage Columns into Hong et al. Dataset

In [7]:
# Calculate eGFR column via helper func
egfr = calc_eGFR(df_hong, 'creatinine_last', 'gender', 'Female', 'age')

# Calculate insert position and insert new eGFR column
ins_pos = df_hong.columns.get_loc('egfr_last') - 1
df_hong.insert(ins_pos, 'egfr_CKD_EPI', egfr)

# Calculate CKD stage column via helper func
CKD_stage = label_CKD_stage(df_hong, 'egfr_CKD_EPI')

# Calculate insert position and insert CKD stage column
ins_pos = df_hong.columns.get_loc('chrkidneydisease') + 1
df_hong.insert(ins_pos, 'chrkidneydisease_Stg', CKD_stage)

# Replace 'inf' with NaN in the inserted columns
df_hong.replace([np.inf, -np.inf], np.nan, inplace=True)

# Summary Statistics for Hong et al. Dataset

In [8]:
# Compute summary statistics for eGFR
summary_stats = df_hong['egfr_CKD_EPI'].describe()
skewness = df_hong['egfr_CKD_EPI'].skew()
kurtosis = df_hong['egfr_CKD_EPI'].kurt()

In [9]:

print(summary_stats)
print(f"eGFR Skewness: {skewness}")
print(f"eGFR Kurtosis: {kurtosis}")

count    212834.000000
mean         86.757274
std          32.423386
min           0.922680
25%          65.844258
50%          93.331470
75%         111.359570
max         231.654835
Name: egfr_CKD_EPI, dtype: float64
eGFR Skewness: -0.6620260226924565
eGFR Kurtosis: -0.28862954490861936


In [10]:
# Frequency counts for each CKD stage
print("--- Patient Frequency by CKD Stage ---\n")
ckd_stage_counts = df_hong['chrkidneydisease_Stg'].value_counts()
print(ckd_stage_counts)

# Proportions for each CKD stage
print("--- Patient Frequency Distribution by CKD Stage ---\n")
ckd_stage_proportions = df_hong['chrkidneydisease_Stg'].value_counts(normalize=True) * 100
print(ckd_stage_proportions)

--- Patient Frequency by CKD Stage ---

chrkidneydisease_Stg
1.0    114709
2.0     53019
3.0     30035
4.0      7849
5.0      7279
Name: count, dtype: int64
--- Patient Frequency Distribution by CKD Stage ---

chrkidneydisease_Stg
1.0    53.881564
2.0    24.904294
3.0    14.108159
4.0     3.686863
5.0     3.419121
Name: proportion, dtype: float64


# Retrieve Unique Patients and Hospitalisations Associated with CKD Related Lab Events in the MIMIC-IV Dataset

In [11]:
# no of patients and hospital admissions related to icd9 data in MIMIC-IV
no_of_unique_patients = df_lab_events.subject_id.nunique()
no_of_unique_hospitalisations = df_lab_events.hadm_id.nunique()
print(f"There are {no_of_unique_patients:,} unique patients and {no_of_unique_hospitalisations:,} unique hospitalisations associated with CKD related lab events in MIMIC-IV.")

There are 251,754 unique patients and 344,150 unique hospitalisations associated with CKD related lab events in MIMIC-IV.


# Retrieve First and Last Recorded Creatinine Serum Lab Values for All MIMIC-IV Patients

In [12]:
# Extract all Creatinine Serum lab events
creat_srm = df_lab_events[df_lab_events['itemid'] == 51081]

# Sort by charttime to ensure we get the first and last correctly
creat_srm_sorted = creat_srm.sort_values(by=['subject_id', 'charttime'])

# Getting first and last entries for each subject_id
creat_srm_first = creat_srm_sorted.groupby('subject_id').first().reset_index()
creat_srm_last = creat_srm_sorted.groupby('subject_id').last().reset_index()

# Merge to get gender and anchor_age from df_all_patients
creat_srm_first_clean = pd.merge(creat_srm_first, df_all_patients[['subject_id', 'gender', 'anchor_age']], on='subject_id', how='left')
creat_srm_last_clean = pd.merge(creat_srm_last, df_all_patients[['subject_id', 'gender', 'anchor_age']], on='subject_id', how='left')

# Rename 'value' column to 'creatinine_serum_first' and 'creatinine_serum_last' respectively
creat_srm_first_clean.rename(columns={'value': 'creatinine_serum_first'}, inplace=True)
creat_srm_last_clean.rename(columns={'value': 'creatinine_serum_last'}, inplace=True)

# Convert 'creatinine_serum_first' and 'creatinine_serum_last' to numeric, coercing errors to NaN
creat_srm_first_clean['creatinine_serum_first'] = pd.to_numeric(creat_srm_first_clean['creatinine_serum_first'], errors='coerce')
creat_srm_last_clean['creatinine_serum_last'] = pd.to_numeric(creat_srm_last_clean['creatinine_serum_last'], errors='coerce')

# Drop rows with NaN values in 'creatinine_serum_first' and/or 'creatinine_serum_last'
creat_srm_first_clean = creat_srm_first_clean.dropna(subset=['creatinine_serum_first'])
creat_srm_last_clean = creat_srm_last_clean.dropna(subset=['creatinine_serum_last'])

display(creat_srm_first_clean.head(3))
display(creat_srm_last_clean.head(3))

Unnamed: 0,subject_id,hadm_id,itemid,charttime,creatinine_serum_first,valueuom,gender,anchor_age
1,10029345,,51081,2163-07-23 11:32:00,1.5,mg/dL,M,47
2,10044685,,51081,2176-01-20 11:00:00,1.2,mg/dL,M,30
5,10061731,,51081,2188-08-12 08:30:00,1.6,mg/dL,M,64


Unnamed: 0,subject_id,hadm_id,itemid,charttime,creatinine_serum_last,valueuom,gender,anchor_age
1,10029345,,51081,2163-07-23 11:32:00,1.5,mg/dL,M,47
2,10044685,,51081,2176-01-20 11:00:00,1.2,mg/dL,M,30
5,10061731,,51081,2188-08-12 08:30:00,1.6,mg/dL,M,64


# Insert New eGFR and CKD Stage Columns into MIMIC-IV Dataset

In [13]:
# Calculate eGFR for first and last measurements
creat_srm_first_clean['eGFR_first'] = calc_eGFR(creat_srm_first_clean, 'creatinine_serum_first', 'gender', 'F', 'anchor_age')
creat_srm_last_clean['eGFR_last'] = calc_eGFR(creat_srm_last_clean, 'creatinine_serum_last', 'gender', 'F', 'anchor_age')

# Selecting relevant columns to form creat_srm_first and creat_srm_last as specified
egfr_first = creat_srm_first_clean[['subject_id', 'gender', 'creatinine_serum_first', 'eGFR_first']]
egfr_last = creat_srm_last_clean[['subject_id', 'gender', 'creatinine_serum_last', 'eGFR_last']]

# Calculate CKD stage column via helper func
CKD_stage_first = label_CKD_stage(egfr_first, 'eGFR_first')
CKD_stage_last = label_CKD_stage(egfr_last, 'eGFR_last')

# Insert CKD stage columns
egfr_first['CKD_stage_first'] = CKD_stage_first
egfr_last['CKD_stage_last'] = CKD_stage_last

# Merge the last time and first time data frames
CKD_lab_summary = pd.merge(
    egfr_first,
    egfr_last[['subject_id', 'creatinine_serum_last', 'eGFR_last', 'CKD_stage_last']],
    on='subject_id',
    how='inner'  # Use 'inner' to keep only those subjects present in both frames
)

CKD_lab_summary

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  egfr_first['CKD_stage_first'] = CKD_stage_first
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  egfr_last['CKD_stage_last'] = CKD_stage_last


Unnamed: 0,subject_id,gender,creatinine_serum_first,eGFR_first,CKD_stage_first,creatinine_serum_last,eGFR_last,CKD_stage_last
0,10029345,M,1.5,57.427841,3,1.5,57.427841,3
1,10044685,M,1.2,83.431804,2,1.2,83.431804,2
2,10061731,M,1.6,47.815741,3,1.6,47.815741,3
3,10116310,F,1.5,36.116148,3,1.5,36.116148,3
4,10133075,F,1.1,70.191912,2,1.0,78.697021,2
...,...,...,...,...,...,...,...,...
312,19877618,M,1.7,51.297871,3,1.7,51.297871,3
313,19901341,F,1.2,53.458109,3,1.2,53.458109,3
314,19908221,M,2.5,28.339284,4,2.5,28.339284,4
315,19930120,F,0.6,107.262924,1,0.6,107.262924,1


# Compute eGFR Summary Statistics for MIMIC-IV Dataset

In [14]:
# Compute summary statistics for first eGFR calculations
first_stats = CKD_lab_summary['eGFR_first'].describe()
first_skew = CKD_lab_summary['eGFR_first'].skew()
first_kurtosis = CKD_lab_summary['eGFR_first'].kurt()

# Compute summary statistics for first eGFR calculations
last_stats = CKD_lab_summary['eGFR_last'].describe()
last_skew = CKD_lab_summary['eGFR_last'].skew()
last_kurtosis = CKD_lab_summary['eGFR_last'].kurt()

In [15]:
print(first_stats)
print(f"First eGFR Skewness: {first_skew}")
print(f"First eGFR Kurtosis: {first_kurtosis}")

count         317.0
mean      73.249315
std       33.252441
min        2.532546
25%       50.796604
50%       76.000512
75%       96.999366
max      185.184697
Name: eGFR_first, dtype: Float64
First eGFR Skewness: -0.05380684464258025
First eGFR Kurtosis: -0.35704328546629327


In [16]:
# Frequency counts for each CKD stage
print("--- Patient Frequency by CKD Stage ---\n")

ckd_stage_counts = CKD_lab_summary['CKD_stage_first'].value_counts()
print(ckd_stage_counts)

ckd_stage_counts = CKD_lab_summary['CKD_stage_last'].value_counts()
print(ckd_stage_counts)

# Proportions for each CKD stage
print("--- Patient Frequency Distribution by CKD Stage ---\n")

ckd_stage_proportions = CKD_lab_summary['CKD_stage_first'].value_counts(normalize=True) * 100
print(ckd_stage_proportions)

ckd_stage_proportions = CKD_lab_summary['CKD_stage_last'].value_counts(normalize=True) * 100
print(ckd_stage_proportions)

--- Patient Frequency by CKD Stage ---

CKD_stage_first
1    109
2     99
3     67
4     25
5     17
Name: count, dtype: int64
CKD_stage_last
1    108
2     95
3     72
4     25
5     17
Name: count, dtype: int64
--- Patient Frequency Distribution by CKD Stage ---

CKD_stage_first
1    34.384858
2    31.230284
3    21.135647
4     7.886435
5     5.362776
Name: proportion, dtype: float64
CKD_stage_last
1    34.069401
2    29.968454
3    22.712934
4     7.886435
5     5.362776
Name: proportion, dtype: float64


# Write Updated Frames and Data Summaries External Files

In [19]:
# Write updated Hong dataframe to CSV file
df_hong.to_csv("hong_et_al/df_updt.csv", index=False)

# Write CKD Lab Summary dataframe to Excel File
CKD_lab_summary.to_excel('mimic_iv_extract/df_ckd_lab_summary.xlsx', index=False)