# Import Necessary Headers

In [1]:
import pandas as pd
import numpy as np

# Load CSV into Pandas Dataframe

In [2]:
df = pd.read_csv("df.csv")

In [3]:
# Initial view of the frame
display(df)

Unnamed: 0,dep_name,esi,age,gender,ethnicity,race,lang,religion,maritalstatus,employstatus,...,cc_vaginaldischarge,cc_vaginalpain,cc_weakness,cc_wheezing,cc_withdrawal-alcohol,cc_woundcheck,cc_woundinfection,cc_woundre-evaluation,cc_wristinjury,cc_wristpain
0,B,4.0,40.0,Male,Hispanic or Latino,White or Caucasian,English,,Single,Full Time,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,B,4.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Pentecostal,Married,Not Employed,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,B,2.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Pentecostal,Married,Not Employed,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A,2.0,66.0,Male,Hispanic or Latino,Native Hawaiian or Other Pacific Islander,English,Pentecostal,Married,Not Employed,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A,3.0,84.0,Female,Hispanic or Latino,Other,Other,Pentecostal,Widowed,Retired,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560481,A,2.0,49.0,Male,Non-Hispanic,White or Caucasian,English,Episcopal,Single,Disabled,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
560482,A,3.0,50.0,Male,Non-Hispanic,White or Caucasian,English,Episcopal,Single,Disabled,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
560483,A,3.0,50.0,Male,Non-Hispanic,White or Caucasian,English,Episcopal,Single,Disabled,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
560484,A,3.0,50.0,Male,Non-Hispanic,White or Caucasian,English,Episcopal,Single,Disabled,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Helper Function for eGFR Calculation

In [4]:
def calc_eGFR(df):
    """
    Vectorized calculation approach to estimated Glomerular Filtration Rate (eGFR) using the CKD-EPI formula.
    
    Parameters:
    - df (pandas.DataFrame): Dataframe with patient data including 'gender', 'age', and 'creatinine_last'.
    
    Returns:
    - pandas.Series: A series representing the eGFR for each patient.
    """

    # Calculate numpy series for constants
    k = np.where(df['gender'] == 'Female', 0.7, 0.9)
    alpha = np.where(df['gender'] == 'Female', -0.241, -0.302)
    gender_scalar = np.where(df['gender'] == 'Female', 1.012, 1.0)
    
    # Calculate pandas series for Serum Creatine / k and min and max components
    scr_k_ratio = df['creatinine_last'] / k
    min_scr_k = np.minimum(scr_k_ratio, 1)
    max_scr_k = np.maximum(scr_k_ratio, 1)
    
    # Calculate eGFR via vectorized operation
    eGFR = 142 * (min_scr_k ** alpha) * (max_scr_k ** -1.2) * (0.9938 ** df['age']) * gender_scalar
    
    return eGFR

# Helper Function for CKD Stage Labeling

In [5]:
def label_CKD_stage(egfr_srs):
    """
    Labels the stage of chronic kidney disease based on patient eGFR levels.

    Parameters:
    - egfr_srs (pandas.Series): A pandas Series containing eGFR levels of patients.

    Returns:
    - pandas.Series: A series representing the CKD stage (1-5) for each patient.
    """
    def label_patient(egfr):
        if pd.isna(egfr):
            return np.nan
        elif egfr >= 90:
            return 1
        elif egfr >= 60:
            return 2
        elif egfr >= 30:
            return 3
        elif egfr >= 15:
            return 4
        else:
            return 5

    return egfr_srs.apply(label_patient)

# Insert New eGFR and CKD Stage Columns

In [6]:
# Calculate eGFR column via helper func
egfr = calc_eGFR(df)

# Calculate insert position and insert new eGFR column
ins_pos = df.columns.get_loc('egfr_last') - 1
df.insert(ins_pos, 'egfr_CKD_EPI', egfr)

# Calculate CKD stage column via helper func
CKD_stage = label_CKD_stage(egfr)

# Calculate insert position and insert CKD stage column
ins_pos = df.columns.get_loc('chrkidneydisease') + 1
df.insert(ins_pos, 'chrkidneydisease_Stg', CKD_stage)

# Replace 'inf' with NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Compute eGFR Summary Statistics 

In [7]:
# Compute summary statistics for eGFR
summary_stats = df['egfr_CKD_EPI'].describe()
skewness = df['egfr_CKD_EPI'].skew()
kurtosis = df['egfr_CKD_EPI'].kurt()

print(summary_stats)
print(f"eGFR Skewness: {skewness}")
print(f"eGFR Kurtosis: {kurtosis}")

count    212834.000000
mean         86.757274
std          32.423386
min           0.922680
25%          65.844258
50%          93.331470
75%         111.359570
max         231.654835
Name: egfr_CKD_EPI, dtype: float64
eGFR Skewness: -0.6620260226924565
eGFR Kurtosis: -0.28862954490861936


# Compute CKD Stage Summary Statistics

In [8]:
# Frequency counts for each CKD stage
ckd_stage_counts = df['chrkidneydisease_Stg'].value_counts()
print(ckd_stage_counts)

# Proportions for each CKD stage
ckd_stage_proportions = df['chrkidneydisease_Stg'].value_counts(normalize=True) * 100
print(ckd_stage_proportions)

chrkidneydisease_Stg
1.0    114709
2.0     53019
3.0     30035
4.0      7849
5.0      7279
Name: count, dtype: int64
chrkidneydisease_Stg
1.0    53.881564
2.0    24.904294
3.0    14.108159
4.0     3.686863
5.0     3.419121
Name: proportion, dtype: float64


# Write Updated Frame to CSV File

In [9]:
# Write updated frame to CSV
df.to_csv("df_updt.csv", index=False)