# Import Necessary Headers

In [9]:
import pandas as pd
import numpy as np

In [23]:
df_all_patients = pd.read_pickle('mimic_iv_extract/df_all_patients.pkl')

In [10]:
df_lab_events = pd.read_pickle('mimic_iv_extract/df_lab_events.pkl')

In [11]:
df_ckd_lab_items = pd.read_pickle('mimic_iv_extract/df_ckd_lab_items.pkl')

# Helper Function for eGFR Calculation

In [51]:
def calc_eGFR(df, col):
    """
    Vectorized calculation approach to estimated Glomerular Filtration Rate (eGFR) using the CKD-EPI formula.
    
    Parameters:
    - df (pandas.DataFrame): Dataframe with patient data including 'gender', 'age', and 'creatinine_last'.
    
    Returns:
    - pandas.Series: A series representing the eGFR for each patient.
    """

    # Calculate numpy series for constants
    k = np.where(df['gender'] == 'F', 0.7, 0.9)
    alpha = np.where(df['gender'] == 'F', -0.241, -0.302)
    gender_scalar = np.where(df['gender'] == 'F', 1.012, 1.0)
    
    # Calculate pandas series for Serum Creatine / k and min and max components
    scr_k_ratio = df[col] / k
    min_scr_k = np.minimum(scr_k_ratio, 1)
    max_scr_k = np.maximum(scr_k_ratio, 1)
    
    # Calculate eGFR via vectorized operation
    eGFR = 142 * (min_scr_k ** alpha) * (max_scr_k ** -1.2) * (0.9938 ** df['anchor_age']) * gender_scalar
    
    return eGFR

# Helper Function for CKD Stage Labeling

In [66]:
def label_CKD_stage(df, col):
    """
    Labels the stage of chronic kidney disease based on patient eGFR levels.

    Parameters:
    - egfr_srs (pandas.Series): A pandas Series containing eGFR levels of patients.

    Returns:
    - pandas.Series: A series representing the CKD stage (1-5) for each patient.
    """
    def label_patient(egfr):
        if pd.isna(egfr):
            return np.nan
        elif egfr >= 90:
            return 1
        elif egfr >= 60:
            return 2
        elif egfr >= 30:
            return 3
        elif egfr >= 15:
            return 4
        else:
            return 5

    return df[col].apply(label_patient)

# Retrieve Number of Unique Patients and Hospitalisations

In [14]:
# no of patients and hospital admissions related to icd9 data in MIMIC-IV
no_of_unique_patients = df_lab_events.subject_id.nunique()
no_of_unique_hospitalisations = df_lab_events.hadm_id.nunique()
print(f"There are {no_of_unique_patients:,} unique patients and {no_of_unique_hospitalisations:,} unique hospitalisations associated with CKD related lab events in MIMIC-IV.")

There are 251,754 unique patients and 344,150 unique hospitalisations associated with CKD related lab events in MIMIC-IV.


# Construct a Dataframe of Number of Patients and Hospitalisations Associated with each CKD Lab Item

In [21]:
# number of patients with each CKD related lab item
df_lab_events_patients = df_lab_events.groupby('itemid').subject_id.nunique().to_frame().reset_index()

# number of hospital entries with each CKD related lab item
df_lab_events_hosp = df_lab_events.groupby('itemid').hadm_id.nunique().to_frame().reset_index()

# merge the two tables
df_lab_events_patients_hosp = pd.merge(df_lab_events_patients, df_lab_events_hosp, on='itemid', how='outer')

# add description
df_lab_events_summary = pd.merge(df_lab_events_patients_hosp, df_ckd_lab_items, on='itemid', how='left')

# rename columns
df_lab_events_summary.columns = ['itemid', 'No. of patients', 'No. of hospital entries', 'label', 'fluid', 'category']

df_lab_events_summary

Unnamed: 0,itemid,No. of patients,No. of hospital entries,label,fluid,category
0,50808,40415,42311,Free_Calcium,Blood,Blood Gas
1,50811,36441,29112,Hemoglobin,Blood,Blood Gas
2,50853,30207,5072,25-OH_Vitamin_D,Blood,Chemistry
3,50861,161076,154178,Alanine_Aminotransferase_(ALT),Blood,Chemistry
4,50862,129769,111433,Albumin,Blood,Chemistry
...,...,...,...,...,...,...
80,51474,5103,5010,Eosinophils,Urine,Hematology
81,51492,188242,126564,Protein,Urine,Hematology
82,51493,147675,94293,RBC,Urine,Hematology
83,51494,92,56,RBC_Casts,Urine,Hematology


# Retrieve First and Last Recorded Creatinine Serum Lab Values for All Patients

In [69]:
# Extract all Creatinine Serum lab events
creat_srm = df_lab_events[df_lab_events['itemid'] == 51081]

# Sort by charttime to ensure we get the first and last correctly
creat_srm_sorted = creatinine_srm.sort_values(by=['subject_id', 'charttime'])

# Getting first and last entries for each subject_id
creat_srm_first = creat_srm_sorted.groupby('subject_id').first().reset_index()
creat_srm_last = creat_srm_sorted.groupby('subject_id').last().reset_index()

# Merge to get gender and anchor_age from df_all_patients
creat_srm_first_clean = pd.merge(creat_srm_first, df_all_patients[['subject_id', 'gender', 'anchor_age']], on='subject_id', how='left')
creat_srm_last_clean = pd.merge(creat_srm_last, df_all_patients[['subject_id', 'gender', 'anchor_age']], on='subject_id', how='left')

# Rename 'value' column to 'creatinine_serum_first' and 'creatinine_serum_last' respectively
creat_srm_first_clean.rename(columns={'value': 'creatinine_serum_first'}, inplace=True)
creat_srm_last_clean.rename(columns={'value': 'creatinine_serum_last'}, inplace=True)

# Convert 'creatinine_serum_first' and 'creatinine_serum_last' to numeric, coercing errors to NaN
creat_srm_first_clean['creatinine_serum_first'] = pd.to_numeric(creat_srm_first_clean['creatinine_serum_first'], errors='coerce')
creat_srm_last_clean['creatinine_serum_last'] = pd.to_numeric(creat_srm_last_clean['creatinine_serum_last'], errors='coerce')

# Drop rows with NaN values in 'creatinine_serum_first' and/or 'creatinine_serum_last'
creat_srm_first_clean = creat_srm_first_clean.dropna(subset=['creatinine_serum_first'])
creat_srm_last_clean = creat_srm_last_clean.dropna(subset=['creatinine_serum_last'])

display(creat_srm_first_clean.head(3))
display(creat_srm_last_clean.head(3))

Unnamed: 0,subject_id,hadm_id,itemid,charttime,creatinine_serum_first,valueuom,gender,anchor_age
1,10029345,,51081,2163-07-23 11:32:00,1.5,mg/dL,M,47
2,10044685,,51081,2176-01-20 11:00:00,1.2,mg/dL,M,30
5,10061731,,51081,2188-08-12 08:30:00,1.6,mg/dL,M,64


Unnamed: 0,subject_id,hadm_id,itemid,charttime,creatinine_serum_last,valueuom,gender,anchor_age
1,10029345,,51081,2163-07-23 11:32:00,1.5,mg/dL,M,47
2,10044685,,51081,2176-01-20 11:00:00,1.2,mg/dL,M,30
5,10061731,,51081,2188-08-12 08:30:00,1.6,mg/dL,M,64


# Insert New eGFR and CKD Stage Columns

In [68]:
# Calculate eGFR for first and last measurements
creat_srm_first_clean['eGFR_first'] = calc_eGFR(creat_srm_first_clean, 'creatinine_serum_first')
creat_srm_last_clean['eGFR_last'] = calc_eGFR(creat_srm_last_clean, 'creatinine_serum_last')

# Selecting relevant columns to form creat_srm_first and creat_srm_last as specified
egfr_first = creat_srm_first_clean[['subject_id', 'gender', 'creatinine_serum_first', 'eGFR_first']]
egfr_last = creat_srm_last_clean[['subject_id', 'gender', 'creatinine_serum_last', 'eGFR_last']]

# Calculate CKD stage column via helper func
CKD_stage_first = label_CKD_stage(egfr_first, 'eGFR_first')
CKD_stage_last = label_CKD_stage(egfr_last, 'eGFR_last')

# Insert CKD stage columns
egfr_first['CKD_stage_first'] = CKD_stage_first
egfr_last['CKD_stage_last'] = CKD_stage_last

# Merge the last time and first time data frames
CKD_lab_summary = pd.merge(
    egfr_first,
    egfr_last[['subject_id', 'creatinine_serum_last', 'eGFR_last', 'CKD_stage_last']],
    on='subject_id',
    how='inner'  # Use 'inner' to keep only those subjects present in both frames
)

CKD_lab_summary

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  creat_srm_first_clean['eGFR_first'] = calc_eGFR(creat_srm_first_clean, 'creatinine_serum_first')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  creat_srm_last_clean['eGFR_last'] = calc_eGFR(creat_srm_last_clean, 'creatinine_serum_last')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  egfr_first['CKD

Unnamed: 0,subject_id,gender,creatinine_serum_first,eGFR_first,CKD_stage_first,creatinine_serum_last,eGFR_last,CKD_stage_last
0,10029345,M,1.5,57.427841,3,1.5,57.427841,3
1,10044685,M,1.2,83.431804,2,1.2,83.431804,2
2,10061731,M,1.6,47.815741,3,1.6,47.815741,3
3,10116310,F,1.5,36.116148,3,1.5,36.116148,3
4,10133075,F,1.1,70.191912,2,1.0,78.697021,2
...,...,...,...,...,...,...,...,...
312,19877618,M,1.7,51.297871,3,1.7,51.297871,3
313,19901341,F,1.2,53.458109,3,1.2,53.458109,3
314,19908221,M,2.5,28.339284,4,2.5,28.339284,4
315,19930120,F,0.6,107.262924,1,0.6,107.262924,1


# Compute eGFR Summary Statistics 

In [70]:
# Compute summary statistics for first eGFR calculations
first_stats = CKD_lab_summary['eGFR_first'].describe()
first_skew = CKD_lab_summary['eGFR_first'].skew()
first_kurtosis = CKD_lab_summary['eGFR_first'].kurt()

# Compute summary statistics for first eGFR calculations
last_stats = CKD_lab_summary['eGFR_last'].describe()
last_skew = CKD_lab_summary['eGFR_last'].skew()
last_kurtosis = CKD_lab_summary['eGFR_last'].kurt()

In [72]:
print(first_stats)
print(f"First eGFR Skewness: {first_skew}")
print(f"First eGFR Kurtosis: {first_kurtosis}")

count         317.0
mean      73.249315
std       33.252441
min        2.532546
25%       50.796604
50%       76.000512
75%       96.999366
max      185.184697
Name: eGFR_first, dtype: Float64
First eGFR Skewness: -0.05380684464258025
First eGFR Kurtosis: -0.35704328546629327


In [74]:
print(last_stats)
print(f"Last eGFR Skewness: {last_skew}")
print(f"Last eGFR Kurtosis: {last_kurtosis}")

count         317.0
mean      72.512795
std       33.589888
min        2.532546
25%        49.77363
50%        72.42734
75%       97.868346
max      185.184697
Name: eGFR_last, dtype: Float64
Last eGFR Skewness: 0.0019733596721180223
Last eGFR Kurtosis: -0.424497992148579


# Write CKD Lab Summary Frame to Excel File

In [76]:
CKD_lab_summary.to_excel('mimic_iv_extract/df_ckd_lab_summary.xlsx', index=False)