In [16]:
import os
os.getcwd()
os.chdir('../../../')
os.getcwd() # TODO: make this nicer

'/Users/carlbuchholz'

In [17]:
import ehrapy as ep
import pandas as pd
import numpy as np
import tableone

## Case Study Data

This tutorial explores the MIMIC-II IAC dataset. It was created for the purpose of a case study in the book: Secondary Analysis of Electronic Health Records, published by Springer in 2016. In particular, the dataset was used throughout Chapter 16 (Data Analysis) by Raffa J. et al. to investigate the effectiveness of indwelling arterial catheters in hemodynamically stable patients with respiratory failure for mortality outcomes. The dataset is derived from MIMIC-II, the publicly-accessible critical care database. It contains summary clinical data and  outcomes for 1,776 patients.

Reference: 

[1] Critical Data, M.I.T., 2016. Secondary analysis of electronic health records (p. 427). Springer Nature. (https://link.springer.com/book/10.1007/978-3-319-43742-2)

[2] https://github.com/MIT-LCP/critical-data-book/tree/master/part_ii/chapter_16/jupyter

[3] https://stackoverflow.com/questions/27328623/anova-test-for-glm-in-python/60769343#60769343

In [18]:
adata = ep.dt.mimic_2(encoded=False)

In [19]:
adata.var.index

Index(['aline_flg', 'icu_los_day', 'hospital_los_day', 'age', 'gender_num',
       'weight_first', 'bmi', 'sapsi_first', 'sofa_first', 'service_unit',
       'service_num', 'day_icu_intime', 'day_icu_intime_num',
       'hour_icu_intime', 'hosp_exp_flg', 'icu_exp_flg', 'day_28_flg',
       'mort_day_censored', 'censor_flg', 'sepsis_flg', 'chf_flg', 'afib_flg',
       'renal_flg', 'liver_flg', 'copd_flg', 'cad_flg', 'stroke_flg',
       'mal_flg', 'resp_flg', 'map_1st', 'hr_1st', 'temp_1st', 'spo2_1st',
       'abg_count', 'wbc_first', 'hgb_first', 'platelet_first', 'sodium_first',
       'potassium_first', 'tco2_first', 'chloride_first', 'bun_first',
       'creatinine_first', 'po2_first', 'pco2_first', 'iv_day_1'],
      dtype='object')

## Case Study and Summary

In [20]:
# adata = ep.dt.mimic_2(encoded=True)
ep.ad.infer_feature_types(adata)
adata.var


[93m![0m Features 'aline_flg', 'gender_num', 'service_num', 'day_icu_intime_num', 'hour_icu_intime', 'hosp_exp_flg', 'icu_exp_flg', 'day_28_flg', 'censor_flg', 'sepsis_flg', 'chf_flg', 'afib_flg', 'renal_flg', 'liver_flg', 'copd_flg', 'cad_flg', 'stroke_flg', 'mal_flg', 'resp_flg' were detected as categorical features stored numerically.Please verify and correct using `ep.ad.replace_feature_types` if necessary.


Unnamed: 0,feature_type
aline_flg,categorical
icu_los_day,numeric
hospital_los_day,numeric
age,numeric
gender_num,categorical
weight_first,numeric
bmi,numeric
sapsi_first,numeric
sofa_first,numeric
service_unit,categorical


We are now ready to generate a summary of the patient characteristics in our
study. We will be using the [tableone](https://github.com/tompollard/tableone) package to generate a summary table of the
patient characteristics. 

In [21]:
continuous_vars = adata.var.index[adata.var['feature_type'] == 'numeric']
categorical_vars = adata.var.index[adata.var['feature_type'] == 'categorical']

continuous_vars = continuous_vars.tolist()
categorical_vars = categorical_vars.tolist()
all_vars = continuous_vars + categorical_vars
table = tableone.TableOne(adata.to_df(), all_vars, categorical=categorical_vars, continuous=continuous_vars, groupby="day_28_flg")
print(table.tabulate(tablefmt="fancy_grid"))

╒══════════════════════════════╤═══════════╤═══════════╤═════════════════╤═════════════════╤═════════════════╕
│                              │           │ Missing   │ Overall         │ 0               │ 1               │
╞══════════════════════════════╪═══════════╪═══════════╪═════════════════╪═════════════════╪═════════════════╡
│ n                            │           │           │ 1776            │ 1493            │ 283             │
├──────────────────────────────┼───────────┼───────────┼─────────────────┼─────────────────┼─────────────────┤
│ icu_los_day, mean (SD)       │           │ 0         │ 3.3 (3.4)       │ 3.2 (3.2)       │ 4.0 (4.0)       │
├──────────────────────────────┼───────────┼───────────┼─────────────────┼─────────────────┼─────────────────┤
│ hospital_los_day, mean (SD)  │           │ 0         │ 8.1 (8.2)       │ 8.4 (8.4)       │ 6.4 (6.4)       │
├──────────────────────────────┼───────────┼───────────┼─────────────────┼─────────────────┼─────────────────┤
│

In [22]:
import numpy as np
import pandas as pd

def cut2(x, bins, min_obs=None, right=True, include_lowest=True):
    """
    Python equivalent of the `cut2` function from the Hmisc package in R.

    Parameters:
    x (array-like): The input data to be binned.
    bins (int or array-like): Number of bins or bin edges.
    min_obs (int, optional): Minimum number of observations in each bin.
    right (bool, optional): Whether the bins include the rightmost edge.
    include_lowest (bool, optional): Whether to include the lowest edge.

    Returns:
    pd.Series: A categorical series with the binned values.
    """
    if isinstance(bins, int):
        # If bins is an integer, create quantile-based bins
        percentiles = np.linspace(0, 100, bins + 1)
        bin_edges = np.percentile(x, percentiles)
    else:
        # If bins is an array-like, use it as bin edges
        bin_edges = np.asarray(bins)
    
    # Ensure min_obs is satisfied
    if min_obs is not None:
        while True:
            bin_counts = pd.cut(x, bins=bin_edges, right=right, include_lowest=include_lowest).value_counts()
            if bin_counts.min() >= min_obs:
                break
            # Adjust bin edges to increase the number of observations in the smallest bin
            bin_edges = np.percentile(x, np.linspace(0, 100, len(bin_edges)))
    
    # Cut the data into bins
    binned = pd.cut(x, bins=bin_edges, right=right, include_lowest=include_lowest)
    return binned

In [24]:
# adata.obs['cut2_sapsi_first'] = cut2(adata.os['sapsi_first'], bins=5)
adata[:, 'sapsi_first']
cut2(adata[:, 'sapsi_first'], bins=5)


ValueError: setting an array element with a sequence. The requested array would exceed the maximum number of dimension of 32.

In [11]:
dependant_var = 'day_28_flg' 
# filter out the dependent variable
# TODO: look into how string types can be handled: "service_unit"
independent_vars = ["aline_flg", "age", "gender_num", "chf_flg", "afib_flg", "renal_flg", "liver_flg", "copd_flg", "cad_flg", "mal_flg", "resp_flg"]
formula = f"{dependant_var} ~ {' + '.join(independent_vars)}"
var_names = independent_vars + [dependant_var]
co2_lm = ep.tl.ols(adata, var_names , formula, missing="drop")
co2_lm_result = co2_lm.fit()
co2_lm_result.summary()

0,1,2,3
Dep. Variable:,day_28_flg,R-squared:,0.175
Model:,OLS,Adj. R-squared:,0.17
Method:,Least Squares,F-statistic:,34.0
Date:,"Mon, 10 Mar 2025",Prob (F-statistic):,3.13e-66
Time:,09:22:02,Log-Likelihood:,-564.19
No. Observations:,1775,AIC:,1152.0
Df Residuals:,1763,BIC:,1218.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.2025,0.028,-7.275,0.000,-0.257,-0.148
aline_flg,0.0136,0.016,0.843,0.399,-0.018,0.045
age,0.0059,0.000,13.419,0.000,0.005,0.007
gender_num,-0.0015,0.016,-0.090,0.928,-0.034,0.031
chf_flg,0.0180,0.028,0.642,0.521,-0.037,0.073
afib_flg,0.1468,0.027,5.385,0.000,0.093,0.200
renal_flg,-0.0971,0.046,-2.098,0.036,-0.188,-0.006
liver_flg,0.0244,0.035,0.703,0.482,-0.044,0.093
copd_flg,-0.0305,0.030,-1.031,0.302,-0.089,0.028

0,1,2,3
Omnibus:,353.92,Durbin-Watson:,2.04
Prob(Omnibus):,0.0,Jarque-Bera (JB):,592.986
Skew:,1.335,Prob(JB):,1.72e-129
Kurtosis:,3.945,Cond. No.,347.0


In [12]:
import pandas as pd
import statsmodels.api as sm
import ehrapy as ep

def drop1(adata, dependant_var, independent_vars, missing="drop"):
    """
    Python implementation of R's drop1 function using ehrapy.tl.ols.

    Args:
        adata: The AnnData object for the OLS model.
        formula: The formula specifying the full model.
        var_names: A list of var names indicating which columns are for the OLS model.
        missing: Available options are 'none', 'drop', and 'raise'.
                 If 'none', no nan checking is done. If 'drop', any observations with nans are dropped.
                 If 'raise', an error is raised.

    Returns:
        pd.DataFrame: A DataFrame containing AIC, BIC, and LRT p-values for each model with one term dropped.
    """
    # Fit the full model
    formula = f"{dependant_var} ~ {' + '.join(independent_vars)}"
    var_names = independent_vars + [dependant_var]
    full_model = ep.tl.ols(adata, var_names=var_names, formula=formula, missing=missing)
    full_model_result = full_model.fit()

    # Initialize a list to store results
    results = []

    # Drop one term at a time and compare models
    for term in independent_vars:
        reduced_formula = f"{formula.split('~')[0].strip()} ~ {' + '.join([t for t in independent_vars if t != term])}"
        reduced_model = ep.tl.ols(adata, var_names=var_names, formula=reduced_formula, missing=missing)
        reduced_model_result = reduced_model.fit()
        
        # Calculate AIC, BIC, and LRT p-value
        aic = reduced_model_result.aic
        lrt_stat, lrt_pvalue, _ = full_model_result.compare_lr_test(reduced_model_result)
        deviance = reduced_model_result.ssr
        
        # Append results
        results.append({
            "Dropped Term": term,
            "Deviance": deviance,
            "AIC": aic,
            "LRT Statistic": lrt_stat,
            "LRT p-value": lrt_pvalue,
        })

    # Convert results to a DataFrame
    results_df = pd.DataFrame(results)
    return results_df 


In [13]:
drop1(adata, dependant_var, independent_vars, missing="drop")

Unnamed: 0,Dropped Term,Deviance,AIC,LRT Statistic,LRT p-value
0,aline_flg,196.327461,1151.086991,0.715044,0.3977744
1,age,216.294095,1323.004706,172.632758,1.9686610000000002e-39
2,gender_num,196.250472,1150.026232,-0.345716,1.0
3,chf_flg,196.294289,1150.787061,0.415113,0.5193855
4,afib_flg,199.476195,1179.328879,28.956931,7.400553e-08
5,renal_flg,196.738154,1154.796198,4.42425,0.03543167
6,liver_flg,196.30336,1150.869084,0.497137,0.480761
7,copd_flg,196.366816,1151.442765,1.070818,0.3007607
8,cad_flg,196.282117,1150.676995,0.305048,0.5807352
9,mal_flg,196.273297,1150.597225,0.225277,0.6350479
