In [None]:
import sys
from os import getcwd
from os.path import join
import pandas as pd
sys.path.insert(0, join(getcwd(), "../module_code"))

from data.load import load_data, load_outcomes
from cli_utils import load_cli_args, init_cli_args

# Keep only the program name in sys.argv, discarding every other argument
# (presumably so the notebook kernel's own arguments are not parsed by the
# project CLI helpers — TODO confirm).
sys.argv = [sys.argv[0]]
# Hand the project options file to the CLI helpers, then build `args`.
# NOTE(review): what load_cli_args/init_cli_args do with the file is defined in
# cli_utils and is not visible here.
load_cli_args("../options.yml")
args = init_cli_args()

# Outcomes

In [None]:
# Load the outcomes table from the UCLA CRRT data directory given in the options.
# NOTE(review): the second argument looks like the key columns (patient id and
# start date) — confirm against data.load.load_outcomes.
outcomes = load_outcomes(args.ucla_crrt_data_dir, ["IP_PATIENT_ID", "Start Date"])

In [None]:
# List the columns available in the outcomes table.
outcomes.columns

In [None]:
# Granular outcome: for every row, the name of the outcome column holding the
# row-wise maximum (idxmax returns the first such column when values tie).
granular_outcome = outcomes[
    ["Recov. renal funct.", "Transitioned to HD", "Comfort Care", "Expired "]
].idxmax(axis="columns")
# Binary outcome: taken directly from the recommend_crrt column.
binary_outcome = outcomes.loc[:, "recommend_crrt"]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# --- Pie chart 1: granular outcome breakdown ---------------------------------
counts = granular_outcome.value_counts()
# https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.pie.html
plt.pie(
    counts,
    labels=counts.index,
    # Show both the percentage and the absolute count on every wedge.
    autopct=lambda p: '{:.2f}%  ({:,.0f})'.format(p, p * counts.sum() / 100),
    # https://seaborn.pydata.org/tutorial/color_palettes.html
    colors=[
        sns.color_palette("mako")[-1],
        sns.color_palette("rocket")[-1],
        sns.color_palette("rocket")[-3],
        sns.color_palette("mako")[-3],
    ],
)
plt.show()

# --- Pie chart 2: binary recommendation breakdown ----------------------------
counts = binary_outcome.value_counts()
# value_counts() orders wedges by frequency, so label and colour are looked up
# from the outcome value itself instead of being assigned by position.
# Assumes recommend_crrt is coded 1 = recommend, 0 = do not recommend
# (the coding used by the replace() mapping in plot_upsetplot) — TODO confirm.
binary_style = {
    1: ("Recommend CRRT", sns.color_palette("mako")[-3]),
    0: ("Do Not Recommend CRRT", sns.color_palette("rocket")[-3]),
}
plt.pie(
    counts,
    autopct=lambda p: '{:.2f}%  ({:,.0f})'.format(p, p * counts.sum() / 100),
    labels=[binary_style[value][0] for value in counts.index],
    colors=[binary_style[value][1] for value in counts.index],
    startangle=300,
)
plt.show()

In [None]:
import numpy as np
# Ref: https://stackoverflow.com/a/68107816/1888794
# Height of the "UCLA: Control" bar. Hard-coded in the notebook;
# TODO derive it from the control cohort data instead.
UCLA_CONTROL_N = 3623

fig, ax = plt.subplots()
bin_names = ["Recommend CRRT", "Do Not\nRecommend CRRT"]
granular_names = ['Recov. renal funct.', 'Transitioned to HD', 'Comfort Care', 'Expired ']
# Index by name so the granular segments are stacked in a fixed order.
counts = granular_outcome.value_counts()[granular_names]
bin_counts = binary_outcome.value_counts()

cmap = plt.colormaps["tab20c"]
outer_colors = cmap(np.arange(3)*4)
inner_colors = cmap([1, 2, 5, 6, 9, 10])

width = 0.6
crrt_bar = ["UCLA: CRRT"]

# Left half of the CRRT bar (negative width + align="edge"): binary outcome.
# bin_counts is keyed by the outcome value, so key 1 pairs with bin_names[0].
ax.bar(crrt_bar, bin_counts[1], width=-width, label=bin_names[0], align="edge", color=outer_colors[0])
ax.bar(crrt_bar, bin_counts[0], width=-width, bottom=bin_counts[1], label=bin_names[1], align="edge", color=outer_colors[1])

# Right half of the CRRT bar: granular outcomes stacked on top of each other.
bottom = 0
for i, (outcome_name, outcome_count) in enumerate(counts.items()):
    ax.bar(crrt_bar, outcome_count, width=width, bottom=bottom, label=outcome_name, align="edge", color=inner_colors[i])
    bottom += outcome_count

# Every container created so far belongs to the CRRT cohort; remember how many
# there are so the labelling loop can pick the right denominator.
n_crrt_containers = len(ax.containers)

ax.bar(["UCLA: Control"], UCLA_CONTROL_N, width=width, label="Do Not\nRecommend CRRT", color=outer_colors[1])

crrt_total = bin_counts.sum()
for i, bars in enumerate(ax.containers):
    padding = -40 if bars.get_label() == "Do Not\nRecommend CRRT" else -30
    total = crrt_total if i < n_crrt_containers else UCLA_CONTROL_N
    ax.bar_label(bars, padding=padding, labels=[f'{x} ({x/total:.1%})\n{bars.get_label()}' for x in bars.datavalues])

ax.set_ylabel("Count of Samples")
ax.set_title("Outcome Breakdown")

# Save BEFORE showing: after plt.show() the current figure is finished/closed,
# so a later plt.savefig() would write an empty canvas.
fig.savefig("outcome_breakdown.eps", format="eps")
plt.show()

In [None]:
import numpy as np
# Nested pie: outer ring = binary recommendation, inner ring = granular outcome.
size = 0.5
fig, ax = plt.subplots()

# ensure correct order
counts = granular_outcome.value_counts()[['Recov. renal funct.', 'Transitioned to HD', 'Comfort Care', 'Expired ']]
bin_counts = binary_outcome.value_counts()
cmap = plt.colormaps["tab20c"]
outer_colors = cmap(np.arange(3)*4)
inner_colors = cmap([1, 2, 5, 6, 9, 10])

# value_counts() orders by frequency, so derive each outer wedge's label and
# colour from its outcome value rather than from its position.
# Assumes recommend_crrt is coded 1 = recommend, 0 = do not recommend
# (the coding used by the replace() mapping in plot_upsetplot) — TODO confirm.
bin_label_by_value = {1: "Recommend CRRT", 0: "Do Not Recommend CRRT"}
bin_color_by_value = {1: outer_colors[0], 0: outer_colors[1]}
bin_outcomes = [bin_label_by_value[value] for value in bin_counts.index]

outerwedges, outertexts, outerautotexts = ax.pie(
    bin_counts,
    radius=1,
    autopct=lambda p: '{:.2f}%\n({:,.0f})'.format(p, p * bin_counts.sum() / 100),
    labels=bin_outcomes,
    wedgeprops=dict(width=size, edgecolor='w'),
    colors=[bin_color_by_value[value] for value in bin_counts.index],
)

innerwedges, innertexts, innerautotexts = ax.pie(
    counts,
    radius=1-size,
    autopct=lambda p: '{:.2f}%\n({:,.0f})'.format(p, p * counts.sum() / 100),
    wedgeprops=dict(width=size, edgecolor='w'),
    colors=inner_colors,
)

# Annotate every inner wedge with its outcome name, connected by a leader line
# anchored at the wedge's mid-angle on the unit circle.
kw = dict(arrowprops=dict(arrowstyle="-"), zorder=0, va="center")
for i, p in enumerate(innerwedges):
    ang = (p.theta2 - p.theta1)/2. + p.theta1
    y = np.sin(np.deg2rad(ang))
    x = np.cos(np.deg2rad(ang))
    # Text on the right half is left-aligned and vice versa. Written as a
    # comparison so that x == 0 (sign 0) does not raise a KeyError.
    horizontalalignment = "left" if x >= 0 else "right"
    connectionstyle = f"angle,angleA=0,angleB={ang}"
    kw["arrowprops"].update({"connectionstyle": connectionstyle})
    ax.annotate(counts.index[i], xy=(x, y), xytext=(1.35*np.sign(x), 1.4*y), horizontalalignment=horizontalalignment, **kw)

ax.set(aspect='equal')
ax.legend(innerwedges, counts.index, loc=(-0.2, 0.1))
plt.show()

# Dataset

In [None]:
# Load the three cohorts through the project loader, selected by name.
# NOTE(review): what load_data returns (presumably a preprocessed DataFrame per
# cohort) is defined in data.load and not visible here.
ucla_crrt_df = load_data(args, "ucla_crrt")
ucla_control_df = load_data(args, "ucla_control")
cedars_crrt_df = load_data(args, "cedars_crrt")

In [None]:
# Inspect the UCLA control cohort frame.
ucla_control_df

In [None]:
# Inspect the UCLA CRRT cohort frame.
ucla_crrt_df

# Patient Type

In [None]:
import upsetplot
from matplotlib import cm
import matplotlib.pyplot as plt
import pandas as pd
def plot_upsetplot(preprocessed_df: pd.DataFrame, cohort: str, colnames: list[str]):
    """Draw an UpSet plot of the `colnames` memberships for one cohort.

    Every intersection bar is stacked by the ``recommend_crrt`` outcome.

    Args:
        preprocessed_df: Frame containing the `colnames` columns and a
            ``recommend_crrt`` column (relabelled here from 0/1 to text).
        cohort: Cohort name used in the plot title.
        colnames: Columns that are cast to bool and used as set memberships.

    Returns:
        The ``upsetplot.UpSet`` object, so callers can read its ``total``,
        ``totals`` and ``intersections`` after plotting.
    """
    fig = plt.figure(figsize=(10, 30))
    indicator_cols = preprocessed_df[colnames].astype(bool)
    # Relabel only the outcome column. The indicator columns must stay boolean
    # to serve as the UpSet index, so they are kept out of the replace() call.
    outcome_labels = preprocessed_df["recommend_crrt"].replace(
        {0: "Do not recommend CRRT", 1: "Recommend CRRT"}
    )
    data = pd.concat([indicator_cols, outcome_labels], axis=1)
    myplot = upsetplot.UpSet(
        data.set_index(colnames),
        intersection_plot_elements=0,  # hide the default intersection bars; stacked bars are added below
        show_counts=True,
        show_percentages=True,
        element_size=50,
    )
    myplot.add_stacked_bars(by="recommend_crrt", colors=cm.Set2, elements=10)
    # Draw into the figure created above instead of relying on the implicit current figure.
    myplot.plot(fig=fig)
    plt.title(f"{cohort} Patient Type Breakdown")
    plt.show()
    return myplot

In [None]:
# Names of all UCLA CRRT columns that contain the "RACE_" marker.
race_cols = [col for col in ucla_crrt_df.columns if "RACE_" in col]
race_cols

In [None]:
# Indicator columns for the three disease groups.
disease_groups = [group + "_pt_indicator" for group in ("heart", "liver", "infection")]

# One UpSet plot per (column set, cohort) pair: CRRT first, then control.
cohorts = (("UCLA CRRT", ucla_crrt_df), ("UCLA Control", ucla_control_df))
for columns in (disease_groups, race_cols):
    for cohort_name, cohort_df in cohorts:
        plot_upsetplot(cohort_df, cohort_name, columns)

## Numerical Breakdown

In [None]:
# NOTE(review): in the visible notebook `myplot` is only assigned inside
# plot_upsetplot, so at notebook scope this raises NameError on a fresh kernel
# unless `myplot` is defined somewhere not shown — confirm which plot is meant.
myplot.total

In [None]:
# Class balance of the recommendation flag in the UCLA CRRT cohort:
# absolute counts first, then the same breakdown as percentages.
recommend_flag = ucla_crrt_df["recommend_crrt"]
print(recommend_flag.value_counts())
print(recommend_flag.value_counts(normalize=True) * 100)

In [None]:
from IPython.display import display
# For the UpSet object's `totals` and `intersections` tables, show each count
# ("N") next to its share of `myplot.total` ("%").
for table in (myplot.totals, myplot.intersections):
    n_column = table.rename("N")
    pct_column = (table / myplot.total * 100).rename("%")
    display(pd.concat([n_column, pct_column], axis=1))

In [None]:
# NOTE(review): `myplot` is only assigned inside plot_upsetplot in the visible
# notebook, and `_df` is a private attribute of the UpSet object — both are
# fragile dependencies.
# Append recommend_crrt as an extra index level (keeping the column as well),
# then count rows for every combination of all index levels.
df = myplot._df.set_index("recommend_crrt", append=True, drop=False)
gb = df.groupby(level=list(range(df.index.nlevels)), sort=True)
counts = gb.size()
# percents = counts / df.groupby(level=list(range(df.index.nlevels - 2)), sort=True) * 100
# Express each count as a percentage of `myplot.intersections`. This relies on
# pandas aligning the two differently-indexed Series on their shared index
# levels — TODO confirm the alignment gives the intended per-intersection share.
percents = counts / myplot.intersections * 100
pd.concat([counts.rename("N"), percents.rename("%")], axis=1)

In [None]:
# Summary statistics (DataFrame.describe) for each cohort; compared below.
ucla_crrt_stats = ucla_crrt_df.describe()
ucla_control_stats = ucla_control_df.describe()
cedars_crrt_stats = cedars_crrt_df.describe()

In [None]:
def compare_vars(var, d1, d2):
    """Show a histogram of `var` for `d1`, then one for `d2`.

    Missing values are dropped before plotting. Both histograms are titled
    with `var` and rendered as separate figures, in argument order.
    """
    for frame in (d1, d2):
        values = frame[var].dropna()
        values.plot(kind="hist", title=var)
        plt.show()
# Compare the distributions of two variables between the UCLA and Cedars CRRT cohorts.
compare_vars("C-REACTIVE PROTEIN_mean", ucla_crrt_df, cedars_crrt_df)
compare_vars("Weight_mean", ucla_crrt_df, cedars_crrt_df)

In [None]:
def compare_stats(s1, s2, threshold=10):
    """Plot and display the columns whose means differ strongly between two summaries.

    Args:
        s1: Summary-statistics frame (as produced by ``DataFrame.describe``);
            must contain a ``"mean"`` row.
        s2: Second summary-statistics frame, subtracted from `s1`.
        threshold: Only columns whose absolute mean difference exceeds this
            value are plotted and displayed. Defaults to 10.

    Returns:
        The full difference frame ``s1 - s2`` with all-NaN columns removed.
    """
    # Columns that exist in only one input become all-NaN after subtraction; drop them.
    diff = (s1 - s2).dropna(how="all", axis=1)
    mean_diff = diff.loc["mean"]
    # Select once and reuse for both the plot and the table.
    large_mean_diff = mean_diff[mean_diff.abs() > threshold]
    large_mean_diff.plot(style=".", rot=90)
    plt.show()
    display(large_mean_diff)
    return diff
# UCLA CRRT vs. Cedars CRRT, then UCLA CRRT vs. UCLA control.
# Only the second call is the cell's last expression, so only its returned
# difference frame is rendered as cell output.
compare_stats(ucla_crrt_stats, cedars_crrt_stats)
compare_stats(ucla_crrt_stats, ucla_control_stats)

# Missing Data

In [None]:
# Working copy of the UCLA CRRT frame without the columns whose name starts
# with "Unnamed" (presumably index columns left over from CSV round-trips —
# TODO confirm).
tmp_df = ucla_crrt_df.drop(ucla_crrt_df.columns[ucla_crrt_df.columns.str.contains("^Unnamed")], axis=1)
def print_missing_info(filter = None):
    """Print how many rows of the global ``tmp_df`` contain no missing value.

    Args:
        filter: Optional column selection passed to ``tmp_df[...]``; when
            given, only those columns are checked. ``None`` checks all columns.

    The percentage is always relative to the total number of rows in ``tmp_df``.
    """
    checked = tmp_df if filter is None else tmp_df[filter]
    # A row is complete when every checked cell is non-missing.
    total_notmissing = int(checked.notna().all(axis=1).sum())
    print(f"Number of patients not missing any data: {total_notmissing}, ({total_notmissing/tmp_df.shape[0] * 100}%)")
# Complete-case count across every column of tmp_df.
print("All")
print_missing_info()

In [None]:
# Directory holding the pickled column selection and receiving the CSV export.
# NOTE(review): absolute, machine-specific path — move it to configuration.
CRRT_REPO_DIR = "/home/davina/Private/repos/CRRT"

# Percent missing per column (indicator/outcome/year columns excluded), least
# missing first. Wrapped in display() because a bare expression that is not the
# last statement of a cell is silently discarded.
display(
    (
        tmp_df.drop(
            ["infection_pt_indicator", "liver_pt_indicator", "heart_pt_indicator", "recommend_crrt", "CRRT Year"],
            axis=1,
        ).isna().mean() * 100
    ).sort_values()
)

# Alternative source of the column selection, kept for reference:
# column_mask = pd.read_pickle(join(CRRT_REPO_DIR, "local_data", "data_transform.pkl")).__self__.named_steps[
#                 "feature-selection"
#             ].get_support()
# SECURITY: unpickling can execute arbitrary code — only load trusted files.
column_mask = pd.read_pickle(join(CRRT_REPO_DIR, "selected_columns.pkl"))

# tmp_df[tmp_df.drop("recommend_crrt",axis=1).columns[column_mask]].drop(["infection_pt_indicator", "liver_pt_indicator", "heart_pt_indicator"], axis=1)

# NOTE(review): DataFrame[...] selects columns only if column_mask holds column
# labels; a boolean array here would filter rows instead — confirm the pickle's
# content.
# Stored as a percentage so that the displayed series matches both its
# "% Missing" name and the values written to the CSV.
amount_missing = tmp_df.drop("recommend_crrt",axis=1)[column_mask].isna().mean().sort_values() * 100
amount_missing.name = "% Missing"
amount_missing.index.name = "Variables"
amount_missing.to_csv(join(CRRT_REPO_DIR, "variables_and_amount_missing.csv"))
amount_missing

In [None]:
# Mean of every column whose name contains "_na".
cols_with_na_tag = tmp_df.columns[tmp_df.columns.str.contains("_na")]
tmp_df[cols_with_na_tag].mean()

In [None]:
import pandas as pd
# Percentage of missing values per feature (0-100 scale).
percent_missing_per_feature = tmp_df.isna().sum() / tmp_df.shape[0] * 100
percents = pd.Series([10, 25, 50], name=">= Percent Missing")

# Number of features missing at least p percent of their values.
# Both sides of the comparison are on the 0-100 scale (comparing against p/100
# would make the thresholds 100x too low).
# The thresholds are used as the index at construction time: passing a
# 0..n-indexed Series to pd.DataFrame(..., index=percents) would reindex it
# and yield only NaN.
counts = pd.Series(
    [(percent_missing_per_feature >= p).sum() for p in percents],
    index=pd.Index(percents, name=percents.name),
    name="Number of Features",
)

counts.to_frame()