In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd

# Load ./data/IBD_readm_data.csv, using the first CSV column as the row index,
# and preview the first rows.
df = pd.read_csv("./data/IBD_readm_data.csv",index_col=0)
df.head()

In [None]:
# (rows, columns) of the raw table as loaded.
df.shape

In [None]:
# Work on a copy so that the loaded `df` is not modified by later assignments to `df_copy`.
df_copy = df.copy()

In [None]:
# Recode the ibd_readm outcome variable.
# Any nrd_visitlink value that appears on at least one row with ibd_readm == 1
# is collected, and then every row sharing one of those nrd_visitlink values is
# flagged with ibd_readm = 1.
# Done with a vectorized selection rather than iterrows(): iterrows() upcasts
# both columns to a common dtype and needs positional `row[0]`/`row[1]` access,
# which pandas has deprecated for label-indexed Series.
readm_set = set(df_copy.loc[df_copy['ibd_readm'] == 1, 'nrd_visitlink'])
readm_list = list(readm_set)

df_copy.loc[df_copy['nrd_visitlink'].isin(readm_list), 'ibd_readm'] = 1

In [None]:
# Rows whose ibd_readm flag is still missing after the recoding are set to 0.
df_copy['ibd_readm'] = df_copy['ibd_readm'].fillna(0)

# Keep only the rows flagged as an index hospitalization (ibd_index_hosp == 1).
readm_data_df = df_copy.loc[df_copy["ibd_index_hosp"].eq(1), :]

# Sanity check on the filter: non-null count of the flag in the filtered frame.
readm_data_df["ibd_index_hosp"].count()

In [None]:
# Preview only: shows the frame with every column containing a missing value removed.
# The result is not assigned, so `readm_data_df` itself is unchanged.
readm_data_df.dropna(axis='columns')

In [None]:
# Histogram of the per-column fraction of missing values in the index-hospitalization rows.
# `nan_proportion` (one value per column, sorted descending) is reused later to
# decide which columns to discard.
fig, ax = plt.subplots(figsize=(15,10))
nan_proportion = readm_data_df.isna().sum().sort_values(ascending=False)/readm_data_df.shape[0]
# Draw on the Axes created above explicitly and label it so the figure stands alone.
sns.histplot(nan_proportion, ax=ax)
ax.set(
    title="Distribution of missing-value fraction across columns",
    xlabel="Fraction of rows that are NaN",
    ylabel="Number of columns",
)
plt.show()

In [None]:
# Columns excluded from the modelling table, grouped by the reason for dropping them.
remove_these_columns = (
    # more than 25% of the values are missing
    list(nan_proportion.loc[nan_proportion > 0.25].index)
    # dx1 ... dx6
    + ["dx{}".format(i) for i in range(1, 7)]
    # _merge has no variance (it is the same value everywhere);
    # the pr* columns contain many dots, which become NaNs;
    # nrd_visitlink looks like an internal ID of some sort;
    # died / death_time / survtime are optional and probably not relevant,
    # since patients who died cannot be readmitted
    + ["_merge", "pr10", "pr12", "pr14", "pr15", "nrd_visitlink", "died", "death_time", "survtime"]
    # redundant with the target column
    + ["ibd_tot_readm", "ibdtotadm_alive_fu", "was_readmitted", "total_adm_num", "ibdtotaladm_from_index"]
)

# Grab the remaining columns. The explicit .copy() makes `data` an independent
# frame, so adding or overwriting columns on it later does not write into a
# slice of `readm_data_df` (which raises pandas' SettingWithCopyWarning).
data = readm_data_df.loc[:, ~readm_data_df.columns.isin(remove_these_columns)].copy()
# NOTE: unless it was removed above, the target column `ibd_readm` is still part of these names.
feature_names = data.columns
data.head(25)

def frailty_risk_cat_to_integer(x):
    """Map a frailty risk label to its ordinal code.

    "low" -> 1, "medium" -> 2, "high" -> 3. Any other value raises KeyError.
    """
    ordinal_by_label = dict(low=1, medium=2, high=3)
    return ordinal_by_label[x]
# Encode the frailty risk category as an ordinal integer (low=1, medium=2, high=3).
# `assign` returns a new frame, so this never writes a column into a slice of
# another DataFrame (avoids SettingWithCopyWarning); the function is passed
# directly instead of through a redundant lambda.
data = data.assign(
    frailty_risk_category=data["frailty_risk_category"].apply(frailty_risk_cat_to_integer)
)

# Columns that are still of object dtype after the encoding step.
data.dtypes.loc[data.dtypes=="object"]

# Drop every row that has at least one missing value, then separate the
# outcome column from the remaining columns.
target_column = "ibd_readm"
data_no_nas = data.dropna()
data_no_nas_no_target = data_no_nas.loc[:, data_no_nas.columns != target_column]
target = data_no_nas[target_column].astype('int64')
data_no_nas_no_target.shape

In [None]:
# Write the NaN-free table (it still contains the target column) to disk as CSV.
data_no_nas.to_csv("./data/IBD_data_cleaned.csv")

In [None]:
# (rows, columns) of the NaN-free table.
data_no_nas.shape