In [8]:
import pandas as pd
import seaborn as sns
import numpy as np
from pathlib import Path
from typing import Iterable
from datasets import SplitHandler

In [9]:
split_info_df = pd.read_csv(Path("resources", "subject_info_df.csv"))
split_info_df.head()


Unnamed: 0,SUBJECT_ID,ICUSTAY_ID,DBSOURCE,CAREUNIT,WARDID,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,ETHNICITY,GENDER,AGE
0,268,280836,carevue,MICU,52,EMERGENCY,EMERGENCY ROOM ADMIT,DEAD/EXPIRED,Medicare,,HISPANIC OR LATINO,F,66
1,269,206613,carevue,MICU,52,EMERGENCY,EMERGENCY ROOM ADMIT,HOME HEALTH CARE,Medicaid,,WHITE,M,40
2,270,220345,carevue,CCU,57,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,UNKNOWN/NOT SPECIFIED,M,80
3,272,210407,carevue,CCU,57,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Medicare,,WHITE,M,67
4,273,241507,carevue,MICU,52,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,ENGL,BLACK/AFRICAN AMERICAN,M,34


In [10]:
class SplitHandler(object):

    def __init__(self, split_info_path):
        self._split_info_df = pd.read_csv(Path(split_info_path))
        self._count_dfs = dict()
        self._others = dict()
        self._collapse_mappings = dict()

        self._colum_settings = {
            "AGE": {
                "binning": 5
            },
            "ETHNICITY": {
                "collapse": ["WHITE", "BLACK", "HISPANIC", "ASIAN", "AMERICAN INDIAN"],
                "other": 500
            },
            "GENDER": {},
            "LANGUAGE": {
                "other": 50
            },
            "INSURANCE": {},
            "DISCHARGE_LOCATION": {},
            "ADMISSION_LOCATION": {},
            "WARDID": {},
            "CAREUNIT": {},
            "DBSOURCE": {}

        }
        self._process_attributes()

    def _value_counts(self, column: str):
        return self._split_info_df.groupby([column], dropna=False, as_index=False).size()


    def _bin_counts(self, column: str, counts_df: pd.DataFrame, bin_size: int):
        bins = np.linspace(self._split_info_df[column].min(), 90, int(90 / bin_size) + 1)
        groups = counts_df.groupby(pd.cut(counts_df[column], bins))["size"]
        return groups.sum().reset_index()


    def _collapse_counts(self, column: str, counts_df: pd.DataFrame, collapse_columns: list):
        collapse_mapping = dict()
        for collapse_column in collapse_columns:
            index = counts_df[column].apply(lambda x: collapse_column in x)
            collapse_mapping[collapse_column] = counts_df[index][column].tolist()
            total_count = counts_df[index]["size"].sum()
            counts_df = counts_df[~index]
            counts_df.loc[len(counts_df)] = [collapse_column, total_count]

        return counts_df.sort_values("size", ascending=False), collapse_mapping


    def _other_counts(self, column: str, counts_df: pd.DataFrame, min_counts: int):
        others = counts_df[(counts_df["size"] <= min_counts)]
        total_count = others["size"].sum()
        counts_df = counts_df[(counts_df["size"] > min_counts)]
        if not counts_df[column].isin(["OTHER"]).any():
            counts_df.loc[len(counts_df)] = ["OTHER", total_count]
        else:
            counts_df = counts_df.set_index(column)
            counts_df.loc["OTHER", "size"] = total_count + counts_df.loc["OTHER", "size"]
            counts_df = counts_df.reset_index()
        return counts_df.sort_values("size", ascending=False), others[column].to_list()

    def _process_attributes(self):
        for attribute, setting in self._colum_settings.items():
            counts_df = self._value_counts(attribute)
            if attribute == "ETHNICITY":
                print()
            if "binning" in setting:
                counts_df = self._bin_counts(attribute, counts_df, setting["binning"])
            if "collapse" in setting:
                counts_df, self._collapse_mappings[attribute] = self._collapse_counts(
                    attribute, counts_df, setting["collapse"])
            if "other" in setting:
                counts_df, self._others[attribute] = self._other_counts(
                    attribute, counts_df, setting["other"])
            self._count_dfs[attribute] = counts_df

    def get_subjects(self, attribute, items):
        if isinstance(items, Iterable):
            
            return [subject for item in items for subject in split_info_df[split_info_df[attribute] == item]["SUBJECT_ID"]]
        elif isinstance(items, str):
            return split_info_df[split_info_df[attribute] == items]["SUBJECT_ID"]
        else:
            raise ValueError(f"Passed {type(items)} to split handler.")

In [11]:
ethnicities = {
    'WHITE': 39030,
    'BLACK': 5498,
    'UNKNOWN/NOT SPECIFIED': 4295,
    'HISPANIC': 2025,
    'ASIAN': 1913,
    'OTHER': 1756,
    'UNABLE TO OBTAIN': 775,
    'PATIENT DECLINED TO ANSWER': 530
}

In [12]:
h = SplitHandler()
h.get_subjects("ETHNICITY", "WHITE")

[]

In [None]:

test_items = "WHITE"
val_items = 

In [None]:
gender = {'F': 24616, 'M': 31214}

In [None]:
language = {
    'ENGL': 27447,
    'NAN': 24067,
    'SPAN': 1029,
    'RUSS': 753,
    'PTUN': 579,
    'OTHER': 434,
    'CANT': 392,
    'PORT': 336,
    'CAPE': 230,
    'MAND': 150,
    'HAIT': 139,
    'ITAL': 121,
    'VIET': 81,
    'GREE': 72
}


In [4]:
insurance = {'Medicare': 26524, 'Private': 21525, 'Medicaid': 5525, 'Government': 1683, 'Self Pay': 573}


In [None]:
discharge_location = {
    'HOME': 17783,
    'HOME HEALTH CARE': 13143,
    'SNF': 7195,
    'REHAB/DISTINCT PART HOSP': 6061,
    'DEAD/EXPIRED': 5755,
    'LONG TERM CARE HOSPITAL': 2273,
    'SHORT TERM HOSPITAL': 1503,
    'DISC-TRAN CANCER/CHLDRN H': 607,
    'DISCH-TRAN TO PSYCH HOSP': 413,
    'HOSPICE-HOME': 409,
    'LEFT AGAINST MEDICAL ADVI': 345,
    'HOSPICE-MEDICAL FACILITY': 159,
    'OTHER FACILITY': 62,
    'HOME WITH HOME IV PROVIDR': 62,
    'ICF': 48,
    'DISC-TRAN TO FEDERAL HC': 11,
    'SNF-MEDICAID ONLY CERTIF': 1
}


In [None]:
admission_location = {
    'EMERGENCY ROOM ADMIT': 21591,
    'PHYS REFERRAL/NORMAL DELI': 14067,
    'CLINIC REFERRAL/PREMATURE': 11452,
    'TRANSFER FROM HOSP/EXTRAM': 8068,
    'TRANSFER FROM SKILLED NUR': 262,
    '** INFO NOT AVAILABLE **': 207,
    'HMO REFERRAL/SICK': 103,
    'TRANSFER FROM OTHER HEALT': 76,
    'TRSF WITHIN THIS FACILITY': 4
}


In [None]:
wardid = {
    52: 7784,
    56: 7611,
    14: 6796,
    23: 5701,
    57: 5065,
    33: 4913,
    7: 4867,
    15: 4801,
    50: 4026,
    12: 3567,
    26: 418,
    16: 121,
    38: 64,
    22: 38,
    37: 34,
    47: 24
}


In [None]:
careunit = {'MICU': 19397, 'SICU': 8038, 'NICU': 8029, 'CSRU': 7923, 'CCU': 6710, 'TSICU': 5733}

In [None]:
db_source = {'carevue': 34549, 'metavision': 21182, 'both': 99}