In [None]:
import sys
from os import getcwd
from os.path import join
import pandas as pd
# Put "../module_code" (joined onto the current working directory) at the front
# of the import path. This must run before the two project imports below.
# NOTE(review): presumably `data` and `cli_utils` live in ../module_code, and the
# notebook is expected to be started from its own directory — confirm.
sys.path.insert(0, join(getcwd(), "../module_code"))

from data.load import load_data, load_outcomes
from cli_utils import load_cli_args, init_cli_args

# Keep only the program name in argv, discarding every other argument.
# NOTE(review): presumably this stops the argument parser behind init_cli_args()
# from seeing the arguments the Jupyter kernel was launched with — confirm.
sys.argv = [sys.argv[0]]
# NOTE(review): load_cli_args presumably registers the options found in
# ../options.yml before init_cli_args() parses them — confirm in cli_utils.
load_cli_args("../options.yml")
# `args` is the configuration object the later cells read from.
args = init_cli_args()

# Outcomes

In [None]:
# Load the outcomes table from the UCLA CRRT data directory named in `args`.
# NOTE(review): the second argument looks like the identifying/key columns
# (patient id and CRRT start date) — confirm against load_outcomes.
outcomes = load_outcomes(args.ucla_crrt_data_dir, ["IP_PATIENT_ID", "Start Date"])

In [None]:
# For every row, take the name of the outcome column holding the largest value.
# NOTE(review): assumes the four columns are one-hot flags; when a row has a tie
# idxmax returns the first of the tied columns — confirm that cannot happen.
# The trailing space in "Expired " is the actual column name.
granular_outcome = outcomes.loc[
    :, ["Recov. renal funct.", "Transitioned to HD", "Comfort Care", "Expired "]
].idxmax(axis="columns")

# The binary target used by the rest of the notebook.
binary_outcome = outcomes.loc[:, "recommend_crrt"]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# --- Pie chart 1: share of each granular outcome -----------------------------
# value_counts() orders the categories by frequency; the labels are taken from
# its index, so each label always matches its slice.
counts = granular_outcome.value_counts()
# https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.pie.html
plt.pie(counts, labels=counts.index, autopct="%1.1f%%",
    # https://seaborn.pydata.org/tutorial/color_palettes.html
    colors=[
        sns.color_palette("mako")[-1],
        sns.color_palette("rocket")[-1],
        sns.color_palette("rocket")[-3],
        sns.color_palette("mako")[-3],
    ]
)
plt.show()

# --- Pie chart 2: share of the binary recommendation -------------------------
counts = binary_outcome.value_counts()
# Label and colour are looked up per outcome value rather than assigned by
# position: value_counts() sorts by frequency, so positional labels would be
# swapped whenever "do not recommend" is the more frequent class.
# NOTE(review): assumes recommend_crrt is coded 1 = recommend, 0 = do not
# recommend (the coding the UpSet-plot cell uses) — confirm for `outcomes`.
# Any other value raises KeyError instead of being silently mislabelled.
binary_outcome_styles = {
    1: ("Recommend CRRT", sns.color_palette("mako")[-3]),
    0: ("Do Not Recommend CRRT", sns.color_palette("rocket")[-3]),
}
plt.pie(counts,  autopct="%1.1f%%",
    labels=[binary_outcome_styles[value][0] for value in counts.index],
    colors=[binary_outcome_styles[value][1] for value in counts.index],
    startangle=300
)
plt.show()

# Dataset

In [None]:
# Load the "ucla_crrt" dataset using the options in `args`.
# NOTE(review): the name suggests load_data returns the already-preprocessed
# feature table (one row per patient/treatment) — confirm in data.load.
preprocessed_df = load_data(args, "ucla_crrt")

# Patient Type
- heart
- liver
- infection

In [None]:
# NOTE(review): `myplot` is only created by the UpSet-plot cell further down, so
# on a fresh kernel (Restart & Run All) this cell raises NameError. It looks like
# a leftover inspection cell — move it below the plotting cell or remove it.
myplot.subset_legend

In [None]:
import upsetplot
from matplotlib import cm
import matplotlib.pyplot as plt
import pandas as pd

# Patient-type groups; each has a "<group>_pt_indicator" column in the dataset.
groups = ["heart", "liver", "infection"]
fig = plt.figure(figsize=(10, 30))
colnames = [f"{name}_pt_indicator" for name in groups]

# UpSet expects boolean membership columns as the index levels.
indicator_cols = preprocessed_df[colnames].astype(bool)

# Relabel only the outcome column. The relabelling used to be applied to the
# whole concatenated frame, which also exposed the boolean indicator columns to
# the 0/1 replacement.
data = pd.concat(
    [
        indicator_cols,
        preprocessed_df["recommend_crrt"].replace(
            {0: "Do not recommend CRRT", 1: "Recommend CRRT"}
        ),
    ],
    axis=1,
)

# intersection_plot_elements=0 hides the default intersection-size bars; the
# stacked bars added below (split by recommendation) take their place.
myplot = upsetplot.UpSet(
    data.set_index(colnames),
    intersection_plot_elements=0,
    show_counts=True,
    show_percentages=True,
    element_size=50,
)
myplot.add_stacked_bars(by="recommend_crrt", colors=cm.Set2, elements=10)

# Draw onto the figure created above explicitly rather than relying on it still
# being matplotlib's "current" figure.
myplot.plot(fig=fig)

## Numerical Breakdown

In [None]:
# Show the `total` attribute of the UpSet plot object.
# NOTE(review): presumably the total number of rows plotted; a later cell divides
# the per-group sizes by it to obtain percentages — confirm in upsetplot docs.
myplot.total

In [None]:
# Class balance of the binary target: absolute counts first, then percentages.
print(
    preprocessed_df["recommend_crrt"].value_counts(),
    preprocessed_df["recommend_crrt"].value_counts(normalize=True) * 100,
    sep="\n",
)

In [None]:
from IPython.display import display

# One table per UpSet summary (per-category totals, then per-intersection
# sizes): the raw size as "N" and its share of `myplot.total` as "%".
for table in (myplot.totals, myplot.intersections):
    display(table.to_frame("N").assign(**{"%": table / myplot.total * 100}))

In [None]:
# Breakdown of each patient-type intersection by recommendation outcome.
# NOTE(review): `_df` is a private attribute of upsetplot.UpSet and may change
# between library versions.
# Append recommend_crrt as an extra index level while also keeping it as a column.
df = myplot._df.set_index("recommend_crrt", append=True, drop=False)
# Group on every index level (the existing levels plus recommend_crrt).
gb = df.groupby(level=list(range(df.index.nlevels)), sort=True)
# Number of rows in each (intersection, outcome) group.
counts = gb.size()
# percents = counts / df.groupby(level=list(range(df.index.nlevels - 2)), sort=True) * 100
# Share of each outcome within its intersection.
# NOTE(review): `counts` has one more index level than `myplot.intersections`;
# this relies on pandas aligning the two on their shared level names — confirm
# the result has no unexpected NaN rows.
percents = counts / myplot.intersections * 100
pd.concat([counts.rename("N"), percents.rename("%")], axis=1)

# Missing Data

In [None]:
# Working copy of the dataset without the columns whose name starts with
# "Unnamed".
# NOTE(review): these are presumably index columns left over from a CSV
# round-trip — confirm.
tmp_df = preprocessed_df.drop(preprocessed_df.columns[preprocessed_df.columns.str.contains("^Unnamed")], axis=1)


def print_missing_info(filter = None):
    """Print how many rows of ``tmp_df`` have no missing value at all.

    Parameters
    ----------
    filter : optional
        Column selector applied as ``tmp_df[filter]`` before checking for
        missing values. ``None`` (the default) checks every column.

    The percentage is always relative to the total number of rows in
    ``tmp_df``. Nothing is returned; the summary is printed.
    """
    frame = tmp_df if filter is None else tmp_df[filter]
    # Vectorised count of fully populated rows (the builtin sum() would walk
    # the boolean Series one element at a time in Python).
    total_notmissing = int(frame.notna().all(axis=1).sum())
    n_rows = tmp_df.shape[0]
    # An empty table has no meaningful percentage; report NaN instead of
    # raising ZeroDivisionError.
    percent_notmissing = total_notmissing / n_rows * 100 if n_rows else float("nan")
    print(f"Number of patients not missing any data: {total_notmissing}, ({percent_notmissing}%)")


print("All")
print_missing_info()

In [None]:
from IPython.display import display

# Directory holding the selected-columns pickle (input) and receiving the CSV
# report (output). Defined once instead of being repeated in every path.
# NOTE(review): absolute, user-specific path — this cell only runs on the
# original author's machine; consider deriving it from `args` or a config value.
crrt_repo_dir = "/home/davina/Private/repos/CRRT"

# Percent missing per column, excluding the patient-type indicators, the target
# and the CRRT year. Displayed explicitly: as a bare expression in the middle of
# the cell its result was computed and then silently discarded.
display(
    (
        tmp_df.drop(
            ["infection_pt_indicator", "liver_pt_indicator", "heart_pt_indicator", "recommend_crrt", "CRRT Year"],
            axis=1,
        ).isna().mean()
        * 100
    ).sort_values()
)

# Alternative source for the mask, kept for reference:
# column_mask = pd.read_pickle(join("/home/davina/Private/repos/CRRT", "local_data","data_transform.pkl")).__self__.named_steps[
#                 "feature-selection"
#             ].get_support()
# NOTE(review): read_pickle executes arbitrary code from the file; only load
# pickles produced by this project.
column_mask = pd.read_pickle(join(crrt_repo_dir, "selected_columns.pkl"))

# tmp_df[tmp_df.drop("recommend_crrt",axis=1).columns[column_mask]].drop(["infection_pt_indicator", "liver_pt_indicator", "heart_pt_indicator"], axis=1)

# Fraction missing (0-1) for the selected columns, sorted ascending.
# NOTE(review): assumes column_mask holds column *names*; a boolean array here
# would be interpreted by DataFrame.__getitem__ as a row filter — confirm.
amount_missing = tmp_df.drop("recommend_crrt",axis=1)[column_mask].isna().mean().sort_values()
amount_missing.name = "% Missing"
amount_missing.index.name = "Variables"
(amount_missing*100).to_csv(join(crrt_repo_dir, "variables_and_amount_missing.csv"))
# Show percentages, matching both the "% Missing" label and the CSV contents
# (`amount_missing` itself still holds fractions).
amount_missing * 100

In [None]:
# Column-wise mean of every column whose name contains "_na".
# NOTE(review): these look like missingness-indicator columns, in which case the
# mean is the fraction of rows flagged as missing — confirm.
tmp_df.loc[:, tmp_df.columns.str.contains("_na")].mean()

In [None]:
import pandas as pd

# Percent (0-100) of rows missing for each feature.
percent_missing_per_feature = tmp_df.isna().sum() / tmp_df.shape[0] * 100

# Thresholds, expressed in percent to match percent_missing_per_feature.
percents = pd.Series([10, 25, 50], name=">= Percent Missing")

# Number of features whose missingness reaches each threshold.
# Two fixes over the earlier version:
#   * the comparison used p/100 against values already in percent, so the
#     thresholds were effectively 0.1 %, 0.25 % and 0.5 %;
#   * the counts were indexed 0..2 and then re-indexed to 10/25/50 by the
#     DataFrame constructor, which produced a table of NaN.
counts = pd.Series(
    [(percent_missing_per_feature >= p).sum() for p in percents],
    index=pd.Index(percents, name=percents.name),
    name="Number of Features",
)

counts.to_frame()