In [1]:
import sys
from os import getcwd
from os.path import join
import pandas as pd
sys.path.insert(0, join(getcwd(), "../module_code"))

from data.load import load_data, load_outcomes
from cli_utils import load_cli_args, init_cli_args

# Reset argv to just the program name — presumably so the arguments the Jupyter
# kernel was launched with are not seen by the project's CLI parsing (TODO confirm).
sys.argv = [sys.argv[0]]
# NOTE(review): presumably registers the options found in ../options.yml; the
# helper is project code that is not visible from this notebook.
load_cli_args("../options.yml")
# `args` is what later cells read the data directories from
# (`args.ucla_crrt_data_dir`, `args.ucla_control_data_dir`).
args = init_cli_args()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from pandas import read_csv
from pandas.errors import ParserError

def read_file(file: str):
    """Read a delimited text file into a DataFrame, tolerating two known quirks.

    The file is first parsed with pandas' defaults. Two kinds of failure are
    retried; anything else (missing file, empty file, ...) propagates unchanged
    instead of being hidden behind a second, unrelated read attempt.

    * ``ParserError`` (malformed rows): re-read with ``on_bad_lines="skip"``.
      A warning is emitted so the dropped rows do not go unnoticed.
    * ``UnicodeDecodeError`` (file is not valid UTF-8): re-read as ``cp1252``.

    Args:
        file: Path of the file to read.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        Any exception ``read_csv`` raises other than the two handled above, or
        the exception raised by the retry if the retry itself fails.
    """
    import warnings  # local import keeps this cell self-contained

    try:
        # Try normally reading the csv with pandas; if it fails the formatting is strange.
        return read_csv(file)
    except ParserError as err:
        # Skipping rows loses data, so surface it rather than doing it silently.
        warnings.warn(
            f"{file}: malformed rows will be skipped ({err})",
            stacklevel=2,
        )
        return read_csv(file, on_bad_lines="skip")
    except UnicodeDecodeError:
        # Only an encoding failure justifies the cp1252 retry.
        return read_csv(file, encoding="cp1252")


# Providers

In [10]:
provider = "Providers.txt"
# Read the same file name from the CRRT and the control directories, in that order.
crrt_df, ctrl_df = (
    read_file(f"{data_dir}/{provider}")
    for data_dir in (args.ucla_crrt_data_dir, args.ucla_control_data_dir)
)

In [11]:
# Column labels of both provider tables, CRRT first, one per line.
print(crrt_df.columns, ctrl_df.columns, sep="\n")

Index(['IP_PROVIDER_ID', 'PROVIDER_TYPE'], dtype='object')
Index(['ip_provider_id', 'provider_id', 'provider_name', 'provider_sex',
       'provider_type', 'primary_specialty', 'ucla_employee_flag'],
      dtype='object')


# Demographics

In [3]:
demo = "Patient_Demographics.txt"
# Read the same file name from the CRRT and the control directories, in that order.
crrt_df, ctrl_df = (
    read_file(f"{data_dir}/{demo}")
    for data_dir in (args.ucla_crrt_data_dir, args.ucla_control_data_dir)
)

In [6]:
# Column labels of the CRRT demographics table (upper-case in this extract).
crrt_df.columns

Index(['IP_PATIENT_ID', 'AGE', 'GENDER', 'RACE', 'ETHNICITY', 'VITAL_STATUS',
       'PCP_IP_PROVIDER_ID'],
      dtype='object')

In [7]:
# Column labels of the control demographics table: lower-case, and some names
# differ from the CRRT extract (e.g. `sex` here where the CRRT table has `GENDER`).
ctrl_df.columns

Index(['ip_patient_id', 'age', 'sex', 'race', 'ethnicity', 'vital_status',
       'ip_current_pcp_id'],
      dtype='object')

In [8]:
# Display the CRRT demographics table; the rendered output elides the middle rows.
crrt_df

Unnamed: 0,IP_PATIENT_ID,AGE,GENDER,RACE,ETHNICITY,VITAL_STATUS,PCP_IP_PROVIDER_ID
0,190EEEB1D0D98BF512DCD2B38F698517,67.0,Male,White or Caucasian,Not Hispanic or Latino,Known Deceased,
1,FB9E13C1092FC991E3B00FCE85989EF0,80.0,Male,Multiple Races,Not Hispanic or Latino,Not Known Deceased,
2,A2631FE4BF2371ED63288EB31D6548DC,55.0,Male,White or Caucasian,Hispanic or Latino,Known Deceased,
3,114407DC6649633A97CCC48FD55AB2AF,72.0,Female,White or Caucasian,Not Hispanic or Latino,Known Deceased,B1C0216BF2E41A7F33CB0DA86B803217
4,1CCBD8BC4942F9F047844B7FA81B0A8D,54.0,Female,White or Caucasian,Not Hispanic or Latino,Known Deceased,8B5A4CF933028CA7B0E43138455821CF
...,...,...,...,...,...,...,...
3592,D03D66FBBB29439AA2A673E3A31E5370,83.0,Male,White or Caucasian,Hispanic or Latino,Not Known Deceased,
3593,CE2973055A83792F17A6953025E29287,44.0,Male,White or Caucasian,Not Hispanic or Latino,Not Known Deceased,B2B4D00F9F85108A48BCF3B7F185A5E1
3594,789738AF42B81299F1F23487CD994D34,69.0,Male,Other,Not Hispanic or Latino,Not Known Deceased,048B8CE66DF0780BE8E023007E5950AA
3595,05CFF6853353C77312C1E72FF32A5DAB,79.0,Female,White or Caucasian,Not Hispanic or Latino,Known Deceased,FC87B0D1C0DC30D9F85EB44615B8C0DB


In [9]:
# Display the control demographics table; the rendered output elides the middle rows.
ctrl_df

Unnamed: 0,ip_patient_id,age,sex,race,ethnicity,vital_status,ip_current_pcp_id
0,00000064C94859EB973942C862550ABA,63,Male,White or Caucasian,Not Hispanic or Latino,Known Deceased,6AF0A1D73390C5FCE938515E33BDBBFF
1,000001070C909F1D89C349A52370F3C3,46,Female,White or Caucasian,Not Hispanic or Latino,Not Known Deceased,4D476A79BAEB708DF1F5A1BF78EF3C3A
2,000003BE051B025E04F32DBF8FF7334E,73,Female,Other,Hispanic or Latino,Not Known Deceased,2DB25CFD109B9AB9FCFEFBC06375B682
3,000005F8F2265713B8C18DE5C273D03B,45,Female,Black or African American,Not Hispanic or Latino,Not Known Deceased,28E01FBFB12BAD44D001B60F2DEC8A77
4,000008E26F8F14E5099662DDB778C698,30,Male,Asian,Not Hispanic or Latino,Known Deceased,
...,...,...,...,...,...,...,...
4723,C91F02E25A3EBE4CD96F003668587FF9,77,Female,Black or African American,Hispanic or Latino,Not Known Deceased,
4724,D2105B0EC7E6A4D4FE4370522C72B276,67,Female,White or Caucasian,Hispanic or Latino,Not Known Deceased,
4725,F2B64924154BA4FA5D1123186135A1CE,71,Male,Black or African American,Not Hispanic or Latino,Not Known Deceased,
4726,FE50F3FA3FDA9FC89C48C965D4743760,67,Female,Other,Hispanic or Latino,Known Deceased,


# Vitals
The vital sign names seem to be mismatched between the CRRT patients and the controls at UCLA.

Controls have a `WEIGHT/SCALE` vital sign type, and the `/` in its name interferes with splitting `SBP/DBP` on `/`.

In [None]:
vitals = "Flowsheet_Vitals.txt"
# Read the same file name from the CRRT and the control directories, in that order.
crrt_df, ctrl_df = (
    read_file(f"{data_dir}/{vitals}")
    for data_dir in (args.ucla_crrt_data_dir, args.ucla_control_data_dir)
)

In [4]:
# Distinct vital-sign type labels in the CRRT vitals table.
crrt_df["VITAL_SIGN_TYPE"].unique()

array(['SpO2', 'O2 Device', 'Temp', 'Height', 'BMI (Calculated)',
       'Weight', 'Resp', 'BP', 'Pulse'], dtype=object)

In [9]:
# Distinct vital-sign type labels in the control vitals table (lower-case column name).
ctrl_df["vital_sign_type"].unique()

array(['PULSE', 'RESPIRATIONS', 'PULSE OXIMETRY', 'TEMPERATURE',
       'BLOOD PRESSURE', 'R BMI', 'WEIGHT/SCALE', 'HEIGHT'], dtype=object)

Every type maps to a counterpart except `O2 Device`, which the CRRT patients have but the controls do not.

In [49]:
# Recorded values of the CRRT rows whose type is SpO2 (row mask and column in one lookup).
crrt_df.loc[crrt_df["VITAL_SIGN_TYPE"] == "SpO2", "VITAL_SIGN_VALUE"]

0        97
4        87
12       98
13       98
14       98
         ..
36517    98
36520    99
36522    98
36525    99
36530    99
Name: VITAL_SIGN_VALUE, Length: 4128, dtype: object

In [51]:
# Recorded values of the control rows whose type is PULSE OXIMETRY.
ctrl_df.loc[ctrl_df["vital_sign_type"] == "PULSE OXIMETRY", "vital_sign_value"]

18            96
19            96
32           100
33           100
38            98
            ... 
121784279    100
121784285    100
121784290    100
121784293    100
121784295    100
Name: vital_sign_value, Length: 29890094, dtype: object

In [53]:
# Recorded values of the CRRT rows whose type is O2 Device (free-text device labels).
crrt_df.loc[crrt_df["VITAL_SIGN_TYPE"] == "O2 Device", "VITAL_SIGN_VALUE"]

1                             Bi-PAP
29             Mechanical Ventilator
43                            Bi-PAP
46             None (Room air);Trach
57       Trach;Mechanical Ventilator
                    ...             
36498                None (Room air)
36507      ETT;Mechanical Ventilator
36508                None (Room air)
36519    Trach;Mechanical Ventilator
36533      ETT;Mechanical Ventilator
Name: VITAL_SIGN_VALUE, Length: 4115, dtype: object

# Diagnoses

In [3]:
# NOTE(review): `pt` is not used by any cell visible here; presumably a patient ID
# kept around for ad-hoc lookups.
pt = "1E6D759D88A19B0CFFE1F2EF2B4238CD"
fname = "Encounter_Diagnoses.txt"
# Read the same file name from the CRRT and the control directories, in that order.
crrt_df, ctrl_df = (
    read_file(f"{data_dir}/{fname}")
    for data_dir in (args.ucla_crrt_data_dir, args.ucla_control_data_dir)
)

## Timing comparison of two explode approaches

In [26]:
# Benchmark input for the explode comparison in the following cells.
# Those cells split the `VITAL_SIGN_*` columns, but on a top-to-bottom run
# `crrt_df` is bound to Encounter_Diagnoses at this point. Re-read the CRRT
# vitals file under its own name so the cell works on a fresh kernel.
crrt_vitals_df = read_file(f"{args.ucla_crrt_data_dir}/Flowsheet_Vitals.txt")
sample = crrt_vitals_df.sample(
    n=min(10000, len(crrt_vitals_df)),  # never request more rows than exist
    random_state=0,  # fixed seed: the same rows are benchmarked on every run
)
# Assign the replaced column back. `inplace=True` on a column selection is
# chained assignment and is not guaranteed to modify `sample`.
sample = sample.assign(
    VITAL_SIGN_TYPE=sample["VITAL_SIGN_TYPE"].replace(
        {"BP": "SBP/DBP", "BLOOD PRESSURE": "SBP/DBP"}
    )
)
# Columns whose "/"-separated entries are split into separate rows below.
explode_cols = ["VITAL_SIGN_VALUE", "VITAL_SIGN_TYPE"]

In [30]:
%%timeit -n 100

# Approach A: move every column that is not exploded into the index, split the
# remaining columns on "/" and explode each one, then turn the index back into
# columns and restore the original column order.
(
    sample.set_index(list(sample.columns.difference(explode_cols)))
        .apply(lambda col: col.str.split("/").explode())
        .reset_index()
        .reindex(sample.columns, axis=1)
)

22.5 ms ± 220 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [31]:
%%timeit  -n 100
# Approach B: split only the columns listed in `explode_cols` on "/" (all other
# columns pass through unchanged), then explode those columns together in one call.
b = (
	sample.apply(
		lambda col: col.str.split("/") 
		if col.name in explode_cols else col
	).explode(explode_cols)
)

14.2 ms ± 60.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [47]:
# Approach A: explode through the index, then restore the original column layout.
a = (
    sample
    .set_index(sample.columns.difference(explode_cols).tolist())
    .apply(lambda values: values.str.split("/").explode())
    .reset_index()
    .reindex(columns=sample.columns)
)
# Approach B: split the target columns in place and explode them together.
b = (
    sample
    .apply(
        lambda values: values.str.split("/")
        if values.name in explode_cols
        else values
    )
    .explode(explode_cols)
    .reset_index(drop=True)  # reset index to be fair comparison
)
# Both approaches must yield exactly the same frame.
a.equals(b)

True

# Medications

In [6]:
# Read Medications.txt from the CRRT and the control directories, in that order.
crrt_df, ctrl_df = (
    read_file(f"{data_dir}/Medications.txt")
    for data_dir in (args.ucla_crrt_data_dir, args.ucla_control_data_dir)
)

  df = read_csv(file)
  df = read_csv(file, on_bad_lines="skip")


In [7]:
# Pharmacy subclass label of each CRRT medication row (contains missing values).
crrt_df["PHARM_SUBCLASS"]

0          Heparins And Heparinoid-Like Agents
1                              Opioid Agonists
2                                          NaN
3                                       Sodium
4                                          NaN
                          ...                 
4286782     Bronchodilators - Anticholinergics
4286783          Imidazole-Related Antifungals
4286784          Imidazole-Related Antifungals
4286785                   Glucocorticosteroids
4286786                           Antitussives
Name: PHARM_SUBCLASS, Length: 4286787, dtype: object

In [8]:
# Subclass name of each control medication row; presumably the counterpart of
# `PHARM_SUBCLASS` in the CRRT extract (TODO confirm the two vocabularies match).
ctrl_df["medispan_subclass_name"]

0                                   NaN
1                                   NaN
2                                   NaN
3                                   NaN
4                                   NaN
                       ...             
9414796                 Opioid Agonists
9414797    Miscellaneous Contrast Media
9414798                   Glycopeptides
9414799      Anaphylaxis Therapy Agents
9414800                Sympathomimetics
Name: medispan_subclass_name, Length: 9414801, dtype: object