In [None]:
!pip install awswrangler

In [None]:
!pip install fastparquet

In [306]:
import boto3
import pandas as pd
import numpy as np
import re
from sagemaker import get_execution_role
import awswrangler as wr
import os, sys
role = get_execution_role()
s3 = boto3.resource('s3')

In [307]:
icusics_db_patients = wr.s3.read_parquet(path="s3://icusics-db/patients/patients.parquet")


In [308]:
icd_codes=[]
for h in icusics_db_patients.hospital_coded.unique():
    diagnoses = wr.s3.read_parquet(path="s3://icusics-db/diagnoses/diagnoses_h%s.parquet"%h)
    icd_codes.append(diagnoses)

icd_codes= pd.concat(icd_codes)


DIABETES II, I, other

In [309]:
name = "diabetes"

In [310]:
idx = np.where(icd_codes.referencecodename.str.contains("Diabetis mellitus|diabetes mellitus|Diabetis", case=False))[0]
comorbidity = icd_codes.iloc[idx].drop_duplicates(subset=["referencecode"])

In [311]:
comorbidity= list(comorbidity.referencecode.unique())
len(comorbidity)

166

In [312]:
comorbidity = icd_codes[icd_codes["referencecode"].isin(comorbidity)]

In [313]:
comorbidity.a_patientid.nunique()

2298

In [314]:
dm= comorbidity.groupby("a_patientid").referencecode.count().reset_index().rename(columns={"referencecode": name})

In [315]:
dm[name] = 1
dm

Unnamed: 0,a_patientid,diabetes
0,1001773,1
1,1002304,1
2,1002515,1
3,1002635,1
4,1003351,1
...,...,...
2293,6960538,1
2294,6980223,1
2295,6980759,1
2296,6982644,1


HEART FAILURE


In [316]:
name ='heart failure'

In [317]:
#idx = np.where(icd_codes.referencecodename.str.contains("insuficiencia cardiaca|atac de cor|heart|Cardiopatia|card|Insuficiència cardíaca", case=False))[0]
#comorbidity = icd_codes.iloc[idx].drop_duplicates(subset=["referencecode"])

In [318]:
comorbidity=icd_codes[icd_codes["referencecode"].str.startswith(("I50", "428", "408", "I11"))] # codes in the excel file "Predictors_IDS"

In [319]:
comorbidity.a_patientid.nunique()

625

In [320]:
hf= comorbidity.groupby("a_patientid").referencecode.count().reset_index().rename(columns={"referencecode": name})

In [321]:
hf[name] = 1

In [322]:
hf.a_patientid.nunique()

625

HYPERTENSION

In [323]:
name ='hypertension'

In [324]:
comorbidity=icd_codes[icd_codes["referencecode"].str.startswith(("I10", "401", "402", "I15","I13", "I11"))] # codes in the excel file "Predictors_IDS"

In [325]:
comorbidity.a_patientid.nunique()

4346

In [326]:
hyp= comorbidity.groupby("a_patientid").referencecode.count().reset_index().rename(columns={"referencecode": name})

In [327]:
hyp[name] = 1

In [328]:
hyp.a_patientid.nunique()

4346

CKD

In [329]:
name ='chronic kidney disease'

In [330]:
comorbidity=icd_codes[icd_codes["referencecode"].str.startswith(("I13", "N18", "585", "D63"))] # codes in the excel file "Predictors_IDS"

In [331]:
ids_to_exclude = ["585.6","585.6/2"]
comorbidity=comorbidity[~comorbidity.referencecode.isin(ids_to_exclude)]

In [332]:
comorbidity.a_patientid.nunique()

903

In [333]:
ckd= comorbidity.groupby("a_patientid").referencecode.count().reset_index().rename(columns={"referencecode": name})

In [334]:
ckd[name] = 1

In [335]:
ckd.a_patientid.nunique()

903

RRT

In [336]:
name ='RRT'

In [337]:
comorbidity=icd_codes[icd_codes["referencecode"].isin(["N18.6","Z99.2"])] # codes in the excel file "Predictors_IDS"

In [338]:
comorbidity.a_patientid.nunique()

10

In [339]:
rrt= comorbidity.groupby("a_patientid").referencecode.count().reset_index().rename(columns={"referencecode": name})

In [340]:
rrt[name] = 1

In [341]:
rrt.a_patientid.nunique()

10

MERGE ALL COMORBIDITIES AND CREATE A DATAFRAME WITH BOOLEAN COLUMNS INDINCATING THE PRESENCE OR NOT OF EACH COMORBIDITY

In [342]:
comorbidity = dm.merge(hf, how="outer")
comorbidity = comorbidity.merge(hyp, how="outer")
comorbidity = comorbidity.merge(ckd, how="outer")
comorbidity = comorbidity.merge(rrt, how="outer")

comorbidity = comorbidity.fillna(0)
comorbidity

Unnamed: 0,a_patientid,diabetes,heart failure,hypertension,chronic kidney disease,RRT
0,1001773,1.0,0.0,1.0,1.0,0.0
1,1002304,1.0,1.0,1.0,1.0,0.0
2,1002515,1.0,0.0,0.0,0.0,0.0
3,1002635,1.0,0.0,0.0,0.0,0.0
4,1003351,1.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...
5705,6834415,0.0,0.0,0.0,1.0,0.0
5706,6858536,0.0,0.0,0.0,1.0,0.0
5707,6882314,0.0,0.0,0.0,1.0,0.0
5708,6924180,0.0,0.0,0.0,1.0,0.0


In [343]:
comorbidity = comorbidity.merge(icusics_db_patients[["a_patientid", "hospital_coded"]], how="inner")
comorbidity

Unnamed: 0,a_patientid,diabetes,heart failure,hypertension,chronic kidney disease,RRT,hospital_coded
0,1001773,1.0,0.0,1.0,1.0,0.0,1
1,1002304,1.0,1.0,1.0,1.0,0.0,1
2,1002515,1.0,0.0,0.0,0.0,0.0,1
3,1002635,1.0,0.0,0.0,0.0,0.0,1
4,1003351,1.0,0.0,1.0,1.0,0.0,1
...,...,...,...,...,...,...,...
5705,6834415,0.0,0.0,0.0,1.0,0.0,6
5706,6858536,0.0,0.0,0.0,1.0,0.0,6
5707,6882314,0.0,0.0,0.0,1.0,0.0,6
5708,6924180,0.0,0.0,0.0,1.0,0.0,6


N° of patients with comorbidities over differnt hospitals

In [344]:
comorbidity.hospital_coded.value_counts()

3    1931
1    1768
4     943
2     647
6     385
5      36
Name: hospital_coded, dtype: Int64

In [345]:
comorbidity.to_csv("comorbidities_all_hosp.csv", index=False)