In [1]:
import pandas as pd
import numpy as np
from ipynb.fs.full.eGFR import calc_eGFR, label_CKD_stage

count    212834.000000
mean         86.757274
std          32.423386
min           0.922680
25%          65.844258
50%          93.331470
75%         111.359570
max         231.654835
Name: egfr_CKD_EPI, dtype: float64
eGFR Skewness: -0.6620260226924565
eGFR Kurtosis: -0.28862954490861936
count    212834.000000
mean         -0.094655
std           0.305534
min          -0.990513
25%          -0.241472
50%           0.000000
75%           0.173790
max           0.876113
Name: eGFR_decline, dtype: float64
eGFR Skewness: -0.8998051086946266
eGFR Kurtosis: 0.24300865399563776
--- Patient Frequency by CKD Stage ---

chrkidneydisease_Stg
1.0    114709
2.0     53019
3.0     30035
4.0      7849
5.0      7279
Name: count, dtype: int64
--- Patient Frequency Distribution by CKD Stage ---

chrkidneydisease_Stg
1.0    53.881564
2.0    24.904294
3.0    14.108159
4.0     3.686863
5.0     3.419121
Name: proportion, dtype: float64
There are 251,754 unique patients and 344,150 unique hospitalisations as

Unnamed: 0,subject_id,hadm_id,itemid,charttime,creatinine_serum_first,valueuom,gender,anchor_age
1,10029345,,51081,2163-07-23 11:32:00,1.5,mg/dL,M,47
2,10044685,,51081,2176-01-20 11:00:00,1.2,mg/dL,M,30
5,10061731,,51081,2188-08-12 08:30:00,1.6,mg/dL,M,64


Unnamed: 0,subject_id,hadm_id,itemid,charttime,creatinine_serum_last,valueuom,gender,anchor_age
1,10029345,,51081,2163-07-23 11:32:00,1.5,mg/dL,M,47
2,10044685,,51081,2176-01-20 11:00:00,1.2,mg/dL,M,30
5,10061731,,51081,2188-08-12 08:30:00,1.6,mg/dL,M,64


count         317.0
mean      73.249315
std       33.252441
min        2.532546
25%       50.796604
50%       76.000512
75%       96.999366
max      185.184697
Name: eGFR_first, dtype: Float64
First eGFR Skewness: -0.05380684464258025
First eGFR Kurtosis: -0.35704328546629327
--- Patient Frequency by CKD Stage ---

CKD_stage_first
1    109
2     99
3     67
4     25
5     17
Name: count, dtype: int64
CKD_stage_last
1    108
2     95
3     72
4     25
5     17
Name: count, dtype: int64
--- Patient Frequency Distribution by CKD Stage ---

CKD_stage_first
1    34.384858
2    31.230284
3    21.135647
4     7.886435
5     5.362776
Name: proportion, dtype: float64
CKD_stage_last
1    34.069401
2    29.968454
3    22.712934
4     7.886435
5     5.362776
Name: proportion, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  "      <th>cc_woundre-evaluation</th>\n",
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  "      <th>cc_wristinjury</th>\n",


In [2]:
# Load the pre-extracted tables in one pass; the names on the left line up
# one-to-one, in order, with the pickle paths listed below.
# NOTE(review): unpickling can execute arbitrary code, so only load trusted extracts.
df_all_patients, df_admissions, df_lab_events, df_ckd_lab_items, df_icd = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'mimic_iv_extract/df_all_patients.pkl',
        "mimic_iv_extract/df_admissions.pkl",
        'mimic_iv_extract/df_lab_events.pkl',
        'mimic_iv_extract/df_ckd_lab_items.pkl',
        'mimic_iv_extract/df_icd_codes_with_description.pkl',
    )
)

# Helper Function for Converting CKD ICD-9 Codes to CKD Stage Labels

In [4]:
def convert_CKD_stage(df, icd_col):
    """
    Converts CKD ICD-9 diagnosis codes (585x) into numeric CKD stage labels.

    Parameters:
    - df (pandas.DataFrame): Frame that holds the ICD-9 codes.
    - icd_col (str): Name of the column in `df` containing the codes, either as
      numbers (5853) or as numeric strings ('5853').

    Returns:
    - pandas.Series: Aligned with `df`. Codes 5851-5855 become stages 1-5, 5856
      becomes 6 and 5859 becomes the sentinel -1. Any other, missing or
      non-numeric code yields NaN.
    """
    stage_by_icd_code = {
        5851: 1,
        5852: 2,
        5853: 3,
        5854: 4,
        5855: 5,
        5856: 6,
        5859: -1,
    }

    # Coerce first so that string-typed codes are matched as well instead of
    # silently falling through; anything unparseable becomes NaN.
    numeric_codes = pd.to_numeric(df[icd_col], errors='coerce')
    return numeric_codes.map(stage_by_icd_code)

# Construct a DataFrame of the Number of Patients and Hospitalisations for Each CKD ICD-9 Code

In [5]:
# Keep only the diagnosis rows whose ICD code starts with '585' (the CKD codes)
is_ckd_code = df_icd['icd_code'].str.startswith('585')
df_icd_585 = df_icd[is_ckd_code]

# Distinct patients and distinct hospital admissions recorded under each CKD code
ckd_rows_by_code = df_icd_585.groupby('icd_code')
df_icd_585_patients = ckd_rows_by_code['subject_id'].nunique().reset_index()
df_icd_585_hosp = ckd_rows_by_code['hadm_id'].nunique().reset_index()

# One description per code, used to label the summary rows
ckd_code_titles = df_icd_585[['icd_code', 'long_title']].drop_duplicates()

# Join both counts, give them readable headers, then attach the description
df_icd_585_patients_hosp = (
    df_icd_585_patients
    .merge(df_icd_585_hosp, on='icd_code', how='outer')
    .rename(columns={'subject_id': 'No. of patients', 'hadm_id': 'No. of hospital entries'})
    .merge(ckd_code_titles, on='icd_code', how='left')
)

df_icd_585_patients_hosp

Unnamed: 0,icd_code,No. of patients,No. of hospital entries,long_title
0,5851,89,98,"Chronic kidney disease, Stage I"
1,5852,827,1105,"Chronic kidney disease, Stage II (mild)"
2,5853,4312,6996,"Chronic kidney disease, Stage III (moderate)"
3,5854,1587,2836,"Chronic kidney disease, Stage IV (severe)"
4,5855,475,682,"Chronic kidney disease, Stage V"
5,5856,2441,8882,End stage renal disease
6,5859,9032,16983,"Chronic kidney disease, unspecified"


# Construct a Dataframe of Number of Patients and Hospitalisations Associated with each CKD Related Lab Item

In [6]:
# Group the lab events once and reuse the grouping for both counts
lab_rows_by_item = df_lab_events.groupby('itemid')

# Distinct patients / distinct hospital admissions that have each lab item
df_lab_events_patients = lab_rows_by_item['subject_id'].nunique().reset_index()
df_lab_events_hosp = lab_rows_by_item['hadm_id'].nunique().reset_index()

# Put the two counts side by side, one row per lab item
df_lab_events_patients_hosp = df_lab_events_patients.merge(df_lab_events_hosp, on='itemid', how='outer')

# Attach the lab item descriptions
df_lab_events_summary = df_lab_events_patients_hosp.merge(df_ckd_lab_items, on='itemid', how='left')

# Positional relabel of all six columns.
# NOTE(review): this assumes df_ckd_lab_items contributes exactly three extra
# columns (label, fluid, category) in that order - confirm against the extract.
df_lab_events_summary.columns = ['itemid', 'No. of patients', 'No. of hospital entries', 'label', 'fluid', 'category']

df_lab_events_summary

Unnamed: 0,itemid,No. of patients,No. of hospital entries,label,fluid,category
0,50808,40415,42311,Free_Calcium,Blood,Blood Gas
1,50811,36441,29112,Hemoglobin,Blood,Blood Gas
2,50853,30207,5072,25-OH_Vitamin_D,Blood,Chemistry
3,50861,161076,154178,Alanine_Aminotransferase_(ALT),Blood,Chemistry
4,50862,129769,111433,Albumin,Blood,Chemistry
...,...,...,...,...,...,...
80,51474,5103,5010,Eosinophils,Urine,Hematology
81,51492,188242,126564,Protein,Urine,Hematology
82,51493,147675,94293,RBC,Urine,Hematology
83,51494,92,56,RBC_Casts,Urine,Hematology


# Construct a DataFrame of the First & Last CKD Stage Diagnoses for Each Patient

In [7]:
# Attach admission/discharge timestamps to every CKD diagnosis row and make sure
# both timestamp columns are real datetimes
admission_times = df_admissions[['subject_id', 'hadm_id', 'admittime', 'dischtime']]
df_icd_585_adm = (
    df_icd_585
    .merge(admission_times, on=['subject_id', 'hadm_id'], how='left')
    .assign(
        admittime=lambda frame: pd.to_datetime(frame['admittime']),
        dischtime=lambda frame: pd.to_datetime(frame['dischtime']),
    )
)

ckd_rows_by_patient = df_icd_585_adm.groupby('subject_id')

# First stage: the diagnosis row with the earliest admission time per patient
first_ckd_stage = (
    df_icd_585_adm
    .loc[ckd_rows_by_patient['admittime'].idxmin()]
    .reset_index(drop=True)
    .rename(columns={'icd_code': 'first_stage_icd', 'long_title': 'first_long_title'})
)

# Last stage: the diagnosis row with the latest discharge time per patient
last_ckd_stage = (
    df_icd_585_adm
    .loc[ckd_rows_by_patient['dischtime'].idxmax()]
    .reset_index(drop=True)
    .rename(columns={'icd_code': 'last_stage_icd', 'long_title': 'last_long_title'})
)

# One row per patient; hadm_id exists on both sides and is therefore suffixed
df_first_last_ckd = first_ckd_stage[
    ['subject_id', 'first_stage_icd', 'first_long_title', 'hadm_id', 'admittime']
].merge(
    last_ckd_stage[['subject_id', 'last_stage_icd', 'last_long_title', 'hadm_id', 'dischtime']],
    on='subject_id', how='inner', suffixes=('_first', '_last'),
)

# Make the ICD codes numeric, then translate them into CKD stages
stage_columns = (
    ('first_stage_icd', 'CKD_stage_first'),
    ('last_stage_icd', 'CKD_stage_last'),
)
for icd_column, stage_column in stage_columns:
    df_first_last_ckd[icd_column] = pd.to_numeric(df_first_last_ckd[icd_column], errors='coerce')
    df_first_last_ckd[stage_column] = convert_CKD_stage(df_first_last_ckd, icd_column)

# CKD duration: whole days from the first admission to the last discharge
observed_span = df_first_last_ckd['dischtime'] - df_first_last_ckd['admittime']
df_first_last_ckd['CKD_duration'] = observed_span.dt.days

# Display the result
df_first_last_ckd

Unnamed: 0,subject_id,first_stage_icd,first_long_title,hadm_id_first,admittime,last_stage_icd,last_long_title,hadm_id_last,dischtime,CKD_stage_first,CKD_stage_last,CKD_duration
0,10000980,5853,"Chronic kidney disease, Stage III (moderate)",29654838,2188-01-03 17:41:00,5854,"Chronic kidney disease, Stage IV (severe)",25242409,2191-04-11 16:21:00,3,4,1193
1,10002013,5859,"Chronic kidney disease, unspecified",24848509,2162-07-08 00:08:00,5859,"Chronic kidney disease, unspecified",24848509,2162-07-09 18:06:00,-1,-1,1
2,10002155,5859,"Chronic kidney disease, unspecified",23822395,2129-08-04 12:44:00,5859,"Chronic kidney disease, unspecified",20345487,2131-03-10 01:55:00,-1,-1,582
3,10003400,5853,"Chronic kidney disease, Stage III (moderate)",29483621,2136-11-04 20:43:00,5853,"Chronic kidney disease, Stage III (moderate)",20214994,2137-03-19 15:45:00,3,3,134
4,10003502,5859,"Chronic kidney disease, unspecified",22491625,2161-06-29 14:34:00,5859,"Chronic kidney disease, unspecified",22491625,2161-07-01 16:56:00,-1,-1,2
...,...,...,...,...,...,...,...,...,...,...,...,...
14007,19997367,5859,"Chronic kidney disease, unspecified",21508795,2127-04-02 01:03:00,5859,"Chronic kidney disease, unspecified",24169669,2128-02-26 16:10:00,-1,-1,330
14008,19997911,5859,"Chronic kidney disease, unspecified",26014121,2195-11-03 19:42:00,5859,"Chronic kidney disease, unspecified",26014121,2195-11-05 12:48:00,-1,-1,1
14009,19998330,5852,"Chronic kidney disease, Stage II (mild)",27282608,2177-07-25 04:34:00,5853,"Chronic kidney disease, Stage III (moderate)",24096838,2178-12-01 17:10:00,2,3,494
14010,19998497,5859,"Chronic kidney disease, unspecified",29288061,2139-07-01 16:19:00,5854,"Chronic kidney disease, Stage IV (severe)",21557581,2145-08-01 13:04:00,-1,4,2222


# Construct Dataframe of First & Last Serum Creatinine and eGFR Values

In [9]:
# Extract all Creatinine Serum lab events (itemid 51081), ordered chronologically per patient
creat_srm_sorted = df_lab_events[df_lab_events['itemid'] == 51081].sort_values(by=['subject_id', 'charttime'])

# Convert 'value' to numeric and discard results that are missing or non-numeric.
# Reassigning (rather than inplace=True) keeps this cell safe to re-run.
creat_srm_sorted['value'] = pd.to_numeric(creat_srm_sorted['value'], errors='coerce')
creat_srm_sorted = creat_srm_sorted.dropna(subset=['value'])

# Take the actual first / last measurement ROW of every patient.
# GroupBy.first()/last() would instead return the first / last non-null value of
# each column independently, so e.g. hadm_id or valueuom could come from a
# different lab event than the creatinine value itself.
creat_by_patient = creat_srm_sorted.groupby('subject_id')
creat_srm_first = creat_by_patient.head(1).reset_index(drop=True)
creat_srm_last = creat_by_patient.tail(1).reset_index(drop=True)
creat_srm_mean = creat_by_patient['value'].mean().reset_index(name='creatinine_serum_mean')

# Merge with patient demographic data (needed by calc_eGFR)
patient_columns = ['subject_id', 'gender', 'anchor_age']
patient_demographics = df_all_patients[patient_columns]
creat_srm_first = creat_srm_first.merge(patient_demographics, on='subject_id', how='left')
creat_srm_last = creat_srm_last.merge(patient_demographics, on='subject_id', how='left')
creat_srm_mean = creat_srm_mean.merge(patient_demographics, on='subject_id', how='left')

# Rename columns for clarity
creat_srm_first = creat_srm_first.rename(columns={'value': 'creatinine_serum_first', 'hadm_id': 'hadm_id_first'})
creat_srm_last = creat_srm_last.rename(columns={'value': 'creatinine_serum_last', 'hadm_id': 'hadm_id_last'})

# Calculate eGFR for first, last, and mean measurements
creat_srm_first['eGFR_first'] = calc_eGFR(creat_srm_first, 'creatinine_serum_first', 'gender', 'F', 'anchor_age')
creat_srm_last['eGFR_last'] = calc_eGFR(creat_srm_last, 'creatinine_serum_last', 'gender', 'F', 'anchor_age')
creat_srm_mean['eGFR_mean'] = calc_eGFR(creat_srm_mean, 'creatinine_serum_mean', 'gender', 'F', 'anchor_age')

# Select relevant columns
egfr_first = creat_srm_first[['subject_id', 'gender', 'creatinine_serum_first', 'eGFR_first', 'hadm_id_first', 'charttime']]
egfr_last = creat_srm_last[['subject_id', 'creatinine_serum_last', 'eGFR_last', 'hadm_id_last', 'charttime']]
egfr_mean = creat_srm_mean[['subject_id', 'creatinine_serum_mean', 'eGFR_mean']]

# Merge data frames; 'charttime' is present on both sides and becomes
# charttime_first / charttime_last
CKD_lab_summary = pd.merge(egfr_first, egfr_last, on='subject_id', suffixes=('_first', '_last'))
CKD_lab_summary = pd.merge(CKD_lab_summary, egfr_mean, on='subject_id')

# Function to find the closest hadm_id based on the reference time
def find_closest_hadm(subject_id, reference_time, time_column, admissions=None):
    """
    Returns the hadm_id of the patient's admission closest in time to a reference time.

    Parameters:
    - subject_id: Patient identifier matched against the 'subject_id' column.
    - reference_time (pandas.Timestamp): Time the admissions are compared with.
    - time_column (str): Datetime column of the admissions table to compare
      against (e.g. 'admittime' or 'dischtime').
    - admissions (pandas.DataFrame, optional): Admissions table with 'subject_id',
      'hadm_id' and `time_column`. Defaults to the notebook-level `df_admissions`.

    Returns:
    - The 'hadm_id' with the smallest absolute time difference, or NaN when the
      patient has no admission or no comparable timestamp (all differences NaT).
    """
    if admissions is None:
        admissions = df_admissions

    subject_admissions = admissions[admissions['subject_id'] == subject_id]
    if subject_admissions.empty:
        return np.nan

    # Drop NaT differences first: idxmin over an all-missing series has no valid
    # answer (it warns or raises depending on the pandas version).
    time_diff = (subject_admissions[time_column] - reference_time).abs().dropna()
    if time_diff.empty:
        return np.nan

    return subject_admissions.loc[time_diff.idxmin(), 'hadm_id']

# Initialize columns for hadm_id_first and hadm_id_last
def _closest_admission_ids(chart_col, admission_time_col):
    """Row by row, look up the admission whose `admission_time_col` is nearest to `chart_col`."""
    return CKD_lab_summary.apply(
        lambda rec: find_closest_hadm(rec['subject_id'], rec[chart_col], admission_time_col),
        axis=1,
    )


def _admission_span_days():
    """Whole days between 'first_admittime' and 'last_dischtime' (NaN where either is missing)."""
    return (CKD_lab_summary['last_dischtime'] - CKD_lab_summary['first_admittime']).dt.days


# Overwrite hadm_id_first / hadm_id_last with the admission nearest to the first /
# last creatinine measurement
CKD_lab_summary['hadm_id_first'] = _closest_admission_ids('charttime_first', 'admittime')
CKD_lab_summary['hadm_id_last'] = _closest_admission_ids('charttime_last', 'dischtime')

# Bring in the admission time of the first admission and the discharge time of the last.
# The first lookup also carries the first admission's un-renamed 'dischtime' column;
# the later merge cell reads it (as 'dischtime_lab'), so it is kept here.
first_admission_times = df_admissions[['hadm_id', 'admittime', 'dischtime']].rename(
    columns={'hadm_id': 'hadm_id_first', 'admittime': 'first_admittime'})
last_admission_times = df_admissions[['hadm_id', 'dischtime']].rename(
    columns={'hadm_id': 'hadm_id_last', 'dischtime': 'last_dischtime'})
CKD_lab_summary = (
    CKD_lab_summary
    .merge(first_admission_times, on='hadm_id_first', how='left')
    .merge(last_admission_times, on='hadm_id_last', how='left')
)

# Ensure datetime format
for time_col in ('first_admittime', 'last_dischtime'):
    CKD_lab_summary[time_col] = pd.to_datetime(CKD_lab_summary[time_col])

# CKD duration in days; where it comes out negative, swap the two timestamps and recompute
CKD_lab_summary['CKD_duration'] = _admission_span_days()
negative_duration_mask = CKD_lab_summary['CKD_duration'] < 0
swapped_times = CKD_lab_summary.loc[negative_duration_mask, ['last_dischtime', 'first_admittime']].values
CKD_lab_summary.loc[negative_duration_mask, ['first_admittime', 'last_dischtime']] = swapped_times
CKD_lab_summary['CKD_duration'] = _admission_span_days()

# Patients without a matched admission: fall back to the span between the first and
# last creatinine chart times
missing_duration_mask = CKD_lab_summary['CKD_duration'].isna()
chart_span_days = (CKD_lab_summary['charttime_last'] - CKD_lab_summary['charttime_first']).dt.days
CKD_lab_summary.loc[missing_duration_mask, 'CKD_duration'] = chart_span_days[missing_duration_mask]

# Derive CKD stages from the first and last eGFR
for egfr_col, stage_col in (('eGFR_first', 'CKD_stage_first'), ('eGFR_last', 'CKD_stage_last')):
    CKD_lab_summary[stage_col] = label_CKD_stage(CKD_lab_summary, egfr_col)

# Display the combined dataframe
CKD_lab_summary

Unnamed: 0,subject_id,gender,creatinine_serum_first,eGFR_first,hadm_id_first,charttime_first,creatinine_serum_last,eGFR_last,hadm_id_last,charttime_last,creatinine_serum_mean,eGFR_mean,first_admittime,dischtime,last_dischtime,CKD_duration,CKD_stage_first,CKD_stage_last
0,10029345,M,1.5,57.427841,,2163-07-23 11:32:00,1.5,57.427841,,2163-07-23 11:32:00,1.50,57.427841,NaT,NaT,NaT,0.0,3,3
1,10044685,M,1.2,83.431804,,2176-01-20 11:00:00,1.2,83.431804,,2176-01-20 11:00:00,1.20,83.431804,NaT,NaT,NaT,0.0,2,2
2,10061731,M,1.6,47.815741,25629152.0,2188-08-12 08:30:00,1.6,47.815741,25629152.0,2188-08-12 08:30:00,1.60,47.815741,2189-06-08 19:17:00,2189-06-10 14:30:00,2189-06-10 14:30:00,1.0,3,3
3,10116310,F,1.5,36.116148,20714359.0,2185-01-06 14:44:00,1.5,36.116148,20714359.0,2185-01-06 14:44:00,1.50,36.116148,2184-12-05 17:04:00,2184-12-10 14:45:00,2184-12-10 14:45:00,4.0,3,3
4,10133075,F,1.1,70.191912,27954510.0,2182-04-05 06:00:00,1.0,78.697021,27954510.0,2182-08-22 13:19:00,1.05,74.22174,2182-10-29 12:45:00,2182-11-01 14:14:00,2182-11-01 14:14:00,3.0,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320,19877618,M,1.7,51.297871,25694728.0,2182-06-06 14:43:00,1.7,51.297871,25694728.0,2182-06-06 14:43:00,1.70,51.297871,2180-03-12 20:00:00,2180-03-16 12:05:00,2180-03-16 12:05:00,3.0,3,3
321,19901341,F,1.2,53.458109,25532334.0,2168-05-14 06:10:00,1.2,53.458109,24115002.0,2168-05-14 06:10:00,1.20,53.458109,2168-06-13 16:56:00,2168-08-02 11:05:00,2168-06-22 15:57:00,8.0,3,3
322,19908221,M,2.5,28.339284,22779950.0,2141-03-20 20:32:00,2.5,28.339284,22779950.0,2141-03-20 20:32:00,2.50,28.339284,2141-03-17 17:35:00,2141-03-25 18:10:00,2141-03-25 18:10:00,8.0,4,4
323,19930120,F,0.6,107.262924,25486927.0,2170-12-16 21:45:00,0.6,107.262924,25486927.0,2170-12-17 14:34:00,0.60,107.262924,2170-12-11 18:43:00,2170-12-25 13:20:00,2170-12-25 13:20:00,13.0,1,1


In [10]:
# Check column dtypes and non-null counts of the lab-derived summary
CKD_lab_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325 entries, 0 to 324
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   subject_id              325 non-null    Int64         
 1   gender                  325 non-null    object        
 2   creatinine_serum_first  325 non-null    float64       
 3   eGFR_first              325 non-null    Float64       
 4   hadm_id_first           300 non-null    float64       
 5   charttime_first         325 non-null    datetime64[us]
 6   creatinine_serum_last   325 non-null    float64       
 7   eGFR_last               325 non-null    Float64       
 8   hadm_id_last            300 non-null    float64       
 9   charttime_last          325 non-null    datetime64[us]
 10  creatinine_serum_mean   325 non-null    float64       
 11  eGFR_mean               325 non-null    Float64       
 12  first_admittime         300 non-null    datetime64

# Merge Dataframes and Reorder Columns

In [17]:
# Perform an outer join on 'subject_id' to include all rows from both dataframes.
# Columns present on both sides get the '_ckd' (ICD-derived) / '_lab' (lab-derived) suffix.
CKD_summary = pd.merge(
    df_first_last_ckd,
    CKD_lab_summary,
    on='subject_id',
    how='outer',
    suffixes=('_ckd', '_lab')
)

# Prefer the ICD-derived stages and fall back to the lab-derived ones where they are missing.
# (CKD_duration and hadm_id_first/last are intentionally not coalesced here: the duration is
# recomputed from the chart times below and the hadm ids are not part of the final table.)
CKD_summary['CKD_stage_first'] = CKD_summary['CKD_stage_first_ckd'].combine_first(CKD_summary['CKD_stage_first_lab'])
CKD_summary['CKD_stage_last'] = CKD_summary['CKD_stage_last_ckd'].combine_first(CKD_summary['CKD_stage_last_lab'])

# Where no lab chart time exists, use the ICD-derived admission / discharge times instead
no_first_chart = CKD_summary['charttime_first'].isna()
CKD_summary.loc[no_first_chart, 'charttime_first'] = CKD_summary.loc[no_first_chart, 'admittime']

no_last_chart = CKD_summary['charttime_last'].isna()
CKD_summary.loc[no_last_chart, 'charttime_last'] = CKD_summary.loc[no_last_chart, 'dischtime_ckd']

# Whenever 'dischtime_lab' is available it overrides 'charttime_last'.
# NOTE(review): 'dischtime_lab' is the discharge time of the admission matched to the FIRST
# creatinine measurement; 'last_dischtime' may have been intended - confirm.
has_lab_discharge = CKD_summary['dischtime_lab'].notna()
CKD_summary.loc[has_lab_discharge, 'charttime_last'] = CKD_summary.loc[has_lab_discharge, 'dischtime_lab']

# Make sure the window runs forwards: swap the two timestamps wherever first > last
swap_mask = CKD_summary['charttime_first'] > CKD_summary['charttime_last']
CKD_summary.loc[swap_mask, ['charttime_first', 'charttime_last']] = CKD_summary.loc[swap_mask, ['charttime_last', 'charttime_first']].values

# CKD_duration: whole days between 'charttime_first' and 'charttime_last'
CKD_summary['CKD_duration'] = (CKD_summary['charttime_last'] - CKD_summary['charttime_first']).dt.days

# stage_delta: -1 if the stage went down, 1 if it went up, 0 if unchanged, NaN if not comparable.
# NOTE(review): the sentinel stages produced by convert_CKD_stage (-1 for 5859, 6 for 5856) are
# compared like ordinary stages, so e.g. -1 -> 4 is recorded as 1 - confirm this is intended.
first_stage = CKD_summary['CKD_stage_first']
last_stage = CKD_summary['CKD_stage_last']
conditions = [
    first_stage > last_stage,   # -1
    first_stage < last_stage,   # 1
    first_stage == last_stage,  # 0
]
choices = [-1, 1, 0]
CKD_summary['stage_delta'] = np.select(conditions, choices, default=np.nan)

# Final column selection and order; every helper / suffixed column not listed here is discarded
column_order = [
    "subject_id",
    "stage_delta",
    "CKD_duration",
    "charttime_first",
    "charttime_last",
    "CKD_stage_first",
    "first_stage_icd",
    "first_long_title",
    "CKD_stage_last",
    "last_stage_icd",
    "last_long_title",
    "creatinine_serum_first",
    "creatinine_serum_mean",
    "creatinine_serum_last",
    "eGFR_first",
    "eGFR_mean",
    "eGFR_last",
    "gender"
]
CKD_summary = CKD_summary[column_order]

CKD_summary

Unnamed: 0,subject_id,stage_delta,CKD_duration,charttime_first,charttime_last,CKD_stage_first,first_stage_icd,first_long_title,CKD_stage_last,last_stage_icd,last_long_title,creatinine_serum_first,creatinine_serum_mean,creatinine_serum_last,eGFR_first,eGFR_mean,eGFR_last,gender
0,10000980,1.0,1193,2188-01-03 17:41:00,2191-04-11 16:21:00,3.0,5853.0,"Chronic kidney disease, Stage III (moderate)",4.0,5854.0,"Chronic kidney disease, Stage IV (severe)",,,,,,,
1,10002013,0.0,1,2162-07-08 00:08:00,2162-07-09 18:06:00,-1.0,5859.0,"Chronic kidney disease, unspecified",-1.0,5859.0,"Chronic kidney disease, unspecified",,,,,,,
2,10002155,0.0,582,2129-08-04 12:44:00,2131-03-10 01:55:00,-1.0,5859.0,"Chronic kidney disease, unspecified",-1.0,5859.0,"Chronic kidney disease, unspecified",,,,,,,
3,10003400,0.0,134,2136-11-04 20:43:00,2137-03-19 15:45:00,3.0,5853.0,"Chronic kidney disease, Stage III (moderate)",3.0,5853.0,"Chronic kidney disease, Stage III (moderate)",,,,,,,
4,10003502,0.0,2,2161-06-29 14:34:00,2161-07-01 16:56:00,-1.0,5859.0,"Chronic kidney disease, unspecified",-1.0,5859.0,"Chronic kidney disease, unspecified",,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14249,19721209,0.0,187,2151-08-09 13:34:00,2152-02-13 12:41:00,1.0,,,1.0,,,0.3,0.3,0.3,152.766486,152.766486,152.766486,F
14250,19725020,0.0,15,2141-11-03 09:00:00,2141-11-18 16:07:00,1.0,,,1.0,,,0.8,0.8,0.8,96.999366,96.999366,96.999366,M
14251,19794091,0.0,21,2158-07-06 12:30:00,2158-07-27 12:49:00,1.0,,,1.0,,,0.6,0.6,0.6,108.605457,108.605457,108.605457,F
14252,19930120,0.0,8,2170-12-16 21:45:00,2170-12-25 13:20:00,1.0,,,1.0,,,0.6,0.6,0.6,107.262924,107.262924,107.262924,F


# Inspect the Combined Summary and Save It to an Excel Sheet

In [18]:
# Check column dtypes and non-null counts of the combined summary before exporting
CKD_summary.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14254 entries, 0 to 14253
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   subject_id              14254 non-null  Int64         
 1   stage_delta             14254 non-null  float64       
 2   CKD_duration            14254 non-null  int64         
 3   charttime_first         14254 non-null  datetime64[us]
 4   charttime_last          14254 non-null  datetime64[us]
 5   CKD_stage_first         14254 non-null  float64       
 6   first_stage_icd         14012 non-null  float64       
 7   first_long_title        14012 non-null  object        
 8   CKD_stage_last          14254 non-null  float64       
 9   last_stage_icd          14012 non-null  float64       
 10  last_long_title         14012 non-null  object        
 11  creatinine_serum_first  325 non-null    float64       
 12  creatinine_serum_mean   325 non-null    float6

In [19]:
# Summary statistics of the combined table as a final sanity check
CKD_summary.describe()

Unnamed: 0,subject_id,stage_delta,CKD_duration,charttime_first,charttime_last,CKD_stage_first,first_stage_icd,CKD_stage_last,last_stage_icd,creatinine_serum_first,creatinine_serum_mean,creatinine_serum_last,eGFR_first,eGFR_mean,eGFR_last
count,14254.0,14254.0,14254.0,14254,14254,14254.0,14012.0,14254.0,14012.0,325.0,325.0,325.0,325.0,325.0,325.0
mean,15005513.715168,0.067279,327.687105,2154-03-12 21:45:03.607408,2155-02-04 03:36:14.265470,1.187456,5856.74686,1.46408,5856.636383,1.452,1.46358,1.472615,73.379111,72.750762,72.50107
min,10000980.0,-1.0,0.0,2105-10-04 17:26:00,2105-10-12 11:11:00,-1.0,5851.0,-1.0,5851.0,0.1,0.1,0.1,2.532546,2.532546,2.532546
25%,12531206.75,0.0,3.0,2134-02-26 16:16:00,2135-01-14 20:23:15,-1.0,5853.0,-1.0,5854.0,0.8,0.8,0.8,50.548064,50.181437,49.253837
50%,15017208.0,0.0,14.0,2154-04-09 19:59:00,2155-03-04 15:52:30,-1.0,5859.0,-1.0,5859.0,1.1,1.1,1.1,76.287477,73.187184,72.42734
75%,17491357.0,0.0,372.0,2174-07-26 01:35:00,2175-06-21 15:05:30,3.0,5859.0,3.0,5859.0,1.5,1.5,1.5,97.575111,97.775014,98.385001
max,19999625.0,1.0,3353.0,2206-12-10 00:10:00,2208-02-19 14:00:00,6.0,5859.0,6.0,5859.0,17.8,17.8,17.8,185.184697,185.184697,185.184697
std,2881507.673045,0.43817,590.03671,,,2.602601,2.70136,2.719351,2.649436,1.492255,1.494973,1.505039,33.245392,33.331704,33.609274


In [20]:
# Save the combined CKD summary table as an Excel workbook; index=False keeps the
# row index out of the sheet
CKD_summary.to_excel('mimic_iv_extract/ckd_summary.xlsx', index=False)