- Immunocompromised

- Antibiotics

- Previous HAIs

- Stay Duration

- Other general HAI Features


# Step 0 - Setup & Retrieve data

In [None]:
# Integrates matplotlib with the notebook
%matplotlib inline

# Import all the libraries we will be using
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import os

from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.decomposition import PCA #Principal Component Analysis
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors

# plotly imports
# !pip install plotly==5.10.0
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.io as pio
pio.renderers.default = "colab"

import warnings
warnings.filterwarnings('ignore')

# Import libraries
from datetime import timedelta
import os

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from IPython.display import display, HTML, Image
%matplotlib inline

plt.style.use('ggplot')
plt.rcParams.update({'font.size': 20})

# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery
# Authenticate
auth.authenticate_user()

# define env for running this notebook
# True, if running it locally as a jupyter notebook
# False, if running it in Google Colab as a jupyter notebook
is_running_locally = False

# Set up a Jupyter Notebook environment to allow for offline plotting with Plotly
def enable_plotly_in_cell():
  """Prepare the current output cell for offline Plotly rendering.

  Displays a script tag that loads require.js, then calls Plotly's
  ``init_notebook_mode`` with the module-level ``is_running_locally`` flag.
  """
  import IPython
  from plotly.offline import init_notebook_mode as start_plotly_notebook_mode

  requirejs_tag = IPython.core.display.HTML(
      '''<script src="/static/components/requirejs/require.js"></script>''')
  display(requirejs_tag)
  start_plotly_notebook_mode(is_running_locally)

### Connect MIMIC DB
# Set up environment variables
# `project_id` is the Google Cloud project the BigQuery calls run under; it is
# also captured as the default `project_id` argument of `run_query`.
project_id = 'ml-health-application'   ### Please change project_id
# Exported as well, presumably so Google Cloud client code that reads
# GOOGLE_CLOUD_PROJECT picks up the same project.
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id

# Read data from BigQuery into pandas dataframes.
def run_query(query, project_id=project_id):
  """Run a standard-SQL BigQuery query and hand back the result.

  Args:
    query: SQL text to execute.
    project_id: Google Cloud project to run under. Defaults to the value the
      module-level ``project_id`` had when this function was defined.

  Returns:
    Whatever ``pd.io.gbq.read_gbq`` returns for the query.
  """
  gbq_options = {'project_id': project_id, 'dialect': 'standard'}
  return pd.io.gbq.read_gbq(query, **gbq_options)

# set the dataset
# if you want to use the demo, change this to mimic_demo
# NOTE(review): the queries visible in this notebook hard-code the
# `physionet-data.mimiciv_hosp` / `mimiciv_icu` table names and never read
# this variable, so changing it alone may have no effect -- confirm.
dataset = 'mimiciv'

### Retrieve HAI patients CSV
# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery
from google.colab import drive

# Make Google Drive available inside the Colab runtime.
# The CSV is expected directly under "MyDrive"; adjust the path below if the
# file is stored somewhere else.
gdrive_mount_point = '/content/gdrive'
drive.mount(gdrive_mount_point)
file_name = "HAI Positive Patients.csv"
file_path = f'{gdrive_mount_point}/MyDrive/{file_name}'


Mounted at /content/gdrive


In [None]:
from pickle import NONE
# utils
from enum import Enum

class Type(Enum):
    """Names the optional JOIN clauses that `get_patient_query` can add.

    Each member maps to a JOIN against a CTE of the same name, which the
    caller is expected to define in the surrounding WITH clause.
    """
    ICDCodes = 1          # INNER JOIN on icd_code against the `ICDCodes` CTE
    ICUStay = 2           # LEFT JOIN on hadm_id/subject_id against the `ICUStay` CTE
    AdmidFirst = 3        # INNER JOIN against the `AdmidFirst` CTE
    PatientAdmission = 4  # INNER JOIN against the `PatientAdmission` CTE

def get_patient_query(terms, limit=True, override_base_query=None):
  """Build the patient-level SELECT with the requested CTE joins spliced in.

  Args:
    terms: Iterable of `Type` members; each contributes its JOIN clause, in
      the order given. The CTE every clause refers to (ICDCodes, ICUStay,
      AdmidFirst, PatientAdmission) has to be supplied by the caller, e.g.
      via `generate_sql_with_ctes`.
    limit: When true, "LIMIT 1000" is appended to the query.
    override_base_query: Optional replacement for the default template. It
      must contain two "{}" placeholders (JOIN clauses first, LIMIT clause
      second) and alias its driving table as `d`, because every JOIN clause
      references `d`.

  Returns:
    The formatted SQL string.

  Raises:
    KeyError: If `terms` contains a value that is not a known `Type`.
  """
  join_clauses = {
      Type.ICDCodes: """INNER JOIN ICDCodes c
  ON c.icd_code = d.icd_code""",
      Type.ICUStay: """LEFT JOIN ICUStay stay
  ON d.hadm_id = stay.hadm_id and d.subject_id = stay.subject_id""",
      # NOTE(review): the `<` comparison on hadm_id is kept as written;
      # confirm it is the intended "earlier admission" test.
      Type.AdmidFirst: """INNER JOIN AdmidFirst ad
  ON ad.hadm_id < d.hadm_id""",
      # Previously the hadm_id condition was written twice; the second half
      # now matches on subject_id, mirroring the ICUStay join.
      Type.PatientAdmission: """INNER JOIN PatientAdmission patad
  ON patad.hadm_id = d.hadm_id and patad.subject_id = d.subject_id""",
  }

  base_patient_query = """SELECT *
  FROM `physionet-data.mimiciv_hosp.diagnoses_icd` d
  {}
  INNER JOIN `physionet-data.mimiciv_hosp.admissions` a
    ON d.hadm_id = a.hadm_id
  INNER JOIN `physionet-data.mimiciv_hosp.patients` p
    ON d.subject_id = p.subject_id
  {}"""

  # `None` (not pickle's NONE opcode) marks "no override supplied".
  if override_base_query is not None:
    base_patient_query = override_base_query

  joined = '\n'.join(join_clauses[term] for term in terms)
  limit_clause = "LIMIT 1000" if limit else ""
  return base_patient_query.format(joined, limit_clause)

# print(get_patient_query([Type.ICDCodes, Type.ICUStay]))

def generate_sql_with_ctes(cte_dict, main_query):
    """Prefix *main_query* with a single WITH clause built from *cte_dict*.

    NOTE: a function with this same name is defined again further down in
    this cell; that later definition is the one in effect at call time.

    Args:
        cte_dict: Mapping of CTE name -> SQL text of that CTE. CTEs are
            emitted in the mapping's iteration order.
        main_query: The statement that follows the CTE list.

    Returns:
        "WITH", a newline, the comma-separated "<name> AS (<sql>)" entries
        (one per line), a newline, then *main_query*. When *cte_dict* is
        empty, *main_query* is returned unchanged, because a bare WITH is
        not valid SQL.
    """
    if not cte_dict:
        return main_query

    cte_definitions = ',\n'.join(
        f"{name} AS ({query})" for name, query in cte_dict.items())
    return f"WITH\n{cte_definitions}\n{main_query}"

def format_icd_codes(icd_codes):
    """Render ICD codes as comma-separated quoted literals with dots removed.

    The result is meant to be placed inside a SQL ``IN (...)`` list.
    """
    quoted_codes = (repr(code.replace('.', '')) for code in icd_codes)
    return ', '.join(quoted_codes)

def generate_sql_with_icd_codes(icd_codes, limit=True):
    """Build the diagnoses/admissions/patients query filtered to *icd_codes*.

    Args:
        icd_codes: ICD codes to match; dots are stripped by
            `format_icd_codes` before they are placed in the IN list.
        limit: When true, "LIMIT 1000" is appended.

    Returns:
        The formatted SQL string.
    """
    if limit:
        limit_clause = "LIMIT 1000"
    else:
        limit_clause = ""

    query_template = """SELECT *
    FROM `physionet-data.mimiciv_hosp.diagnoses_icd` d
    INNER JOIN `physionet-data.mimiciv_hosp.admissions` a
      ON d.hadm_id = a.hadm_id
    INNER JOIN `physionet-data.mimiciv_hosp.patients` p
      ON d.subject_id = p.subject_id
    WHERE d.icd_code in ({})
    {}"""

    in_list = format_icd_codes(icd_codes)
    return query_template.format(in_list, limit_clause)

def generate_sql_with_ctes(cte_dict, main_query):
    """Prefix *main_query* with a single WITH clause built from *cte_dict*.

    NOTE: this re-defines the function of the same name declared earlier in
    this cell; being the later definition, it is the one in effect.

    Args:
        cte_dict: Mapping of CTE name -> SQL text of that CTE. CTEs are
            emitted in the mapping's iteration order.
        main_query: The statement that follows the CTE list.

    Returns:
        "WITH", a newline, the comma-separated "<name> AS (<sql>)" entries
        (one per line), a newline, then *main_query*. When *cte_dict* is
        empty, *main_query* is returned unchanged, because a bare WITH is
        not valid SQL.
    """
    if not cte_dict:
        return main_query

    cte_definitions = ',\n'.join(
        f"{name} AS ({query})" for name, query in cte_dict.items())
    return f"WITH\n{cte_definitions}\n{main_query}"

def add_like_query(query, terms, label="long_title"):
    """Append a case-insensitive LIKE filter for any of *terms* to *query*.

    Args:
        query: SQL text that does not yet have a WHERE clause.
        terms: Substrings to search for; a row matches when the column
            contains at least one of them.
        label: Column (or expression) to search. It is wrapped in lower().

    Returns:
        *query* followed by "WHERE lower(<label>) LIKE '%<term>%' OR ...".

    Raises:
        ValueError: If *terms* is empty, since that would leave a dangling
            WHERE and produce invalid SQL.
    """
    terms = list(terms)
    if not terms:
        raise ValueError("add_like_query needs at least one search term")

    def _like_pattern(term):
        # The column is lower-cased, so the pattern must be as well or a
        # term containing capitals could never match. Single quotes are
        # backslash-escaped so a term cannot terminate the string literal.
        return term.lower().replace("'", "\\'")

    conditions = [
        f"lower({label}) LIKE '%{_like_pattern(term)}%'" for term in terms
    ]
    return f"{query} WHERE {' OR '.join(conditions)}"

### Compromised Immune System

In [None]:
# Keywords for urinary-tract abnormalities (defined here, not used in this cell).
urinary_abnorm_terms = [
    "obstruction",
    "urinary retention",
    "congenital anomaly",
    "vesicoureteral reflux",
    "bladder diverticulum",
]

# Diagnosis-title keywords for immune-compromising conditions (e.g. leukemia,
# HIV, diabetes). Patients with such conditions, or on immunosuppressants,
# are more susceptible to infections, including CAUTIs.
# NOTE(review): matching is by substring, so short keywords such as "aids"
# also hit any title that merely contains those letters -- confirm the
# resulting code list is acceptable.
immune_compromised_terms = [
    "hiv",
    "aids",
    "diabetes",
    "organ transplant",
    "leukemia",
    "lymphoma",
    "cancer",
    "chronic kidney disease",
    "end-stage renal disease",
    "splenectomy",
    "bone marrow transplant",
    "malnutrition",
    "congenital immune deficiencies",
]

# ICD dictionary rows whose long title contains any of the keywords.
icd_dictionary_query = "SELECT icd_code, long_title FROM `physionet-data.mimiciv_hosp.d_icd_diagnoses`"
immune_compromised_diagnoses = add_like_query(icd_dictionary_query, immune_compromised_terms)

# Expose that lookup as the `ICDCodes` CTE expected by the ICDCodes join.
immune_compromised_cte = {"ICDCodes": immune_compromised_diagnoses}

# Unlimited patient-level query joined to the matching ICD codes.
patient_level_query = get_patient_query([Type.ICDCodes], False)
immune_compromised_patients_sql = generate_sql_with_ctes(immune_compromised_cte, patient_level_query)
immune_compromised_patients = run_query(immune_compromised_patients_sql)
immune_compromised_patients

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,icd_code_1,long_title,subject_id_1,hadm_id_1,admittime,...,race,edregtime,edouttime,hospital_expire_flag,subject_id_2,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10003400,20214994,30,40390,9,40390,"Hypertensive chronic kidney disease, unspecifi...",10003400,20214994,2137-02-24 10:00:00,...,BLACK/AFRICAN AMERICAN,NaT,NaT,0,10003400,F,72,2134,2011 - 2013,2137-09-02
1,10024331,25768667,26,28521,9,28521,Anemia in chronic kidney disease,10024331,25768667,2144-09-07 15:17:00,...,WHITE - RUSSIAN,NaT,NaT,0,10024331,M,72,2140,2008 - 2010,2145-01-23
2,10031358,24522342,33,25072,9,25072,Diabetes with peripheral circulatory disorders...,10031358,24522342,2158-03-10 18:06:00,...,WHITE,2158-03-10 13:12:00,2158-03-10 19:51:00,0,10031358,M,58,2152,2008 - 2010,NaT
3,10031358,24522342,35,40390,9,40390,"Hypertensive chronic kidney disease, unspecifi...",10031358,24522342,2158-03-10 18:06:00,...,WHITE,2158-03-10 13:12:00,2158-03-10 19:51:00,0,10031358,M,58,2152,2008 - 2010,NaT
4,10031358,24522342,36,5859,9,5859,"Chronic kidney disease, unspecified",10031358,24522342,2158-03-10 18:06:00,...,WHITE,2158-03-10 13:12:00,2158-03-10 19:51:00,0,10031358,M,58,2152,2008 - 2010,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302789,19918413,21520010,25,E11649,10,E11649,Type 2 diabetes mellitus with hypoglycemia wit...,19918413,21520010,2129-09-07 16:00:00,...,BLACK/AFRICAN AMERICAN,2129-09-07 12:11:00,2129-09-07 17:46:00,0,19918413,M,75,2129,2017 - 2019,2130-12-18
302790,19919213,28914454,25,N189,10,N189,"Chronic kidney disease, unspecified",19919213,28914454,2203-09-13 17:06:00,...,WHITE - OTHER EUROPEAN,2203-09-13 12:42:00,2203-09-13 18:39:00,0,19919213,M,83,2194,2008 - 2010,2203-09-29
302791,19944094,20794406,25,E11621,10,E11621,Type 2 diabetes mellitus with foot ulcer,19944094,20794406,2163-11-12 01:32:00,...,WHITE,2163-11-11 20:57:00,2163-11-12 02:12:00,1,19944094,M,91,2163,2017 - 2019,2163-11-25
302792,19964153,20368705,25,E1040,10,E1040,Type 1 diabetes mellitus with diabetic neuropa...,19964153,20368705,2135-12-28 02:40:00,...,WHITE,2135-12-27 20:08:00,2135-12-28 07:17:00,1,19964153,M,71,2131,2014 - 2016,2136-01-11


### Antibiotics

A list of some commonly used immunosuppressant drugs that could potentially increase the risk of HAIs:

1. Corticosteroids: These are a class of drugs that include:

* Prednisone
* Methylprednisolone (Medrol)
* Hydrocortisone
* Dexamethasone
2. Calcineurin Inhibitors: These drugs block the action of calcineurin, which plays a role in the activation of T cells.

* Cyclosporine (Neoral, Sandimmune, Gengraf)
* Tacrolimus (Prograf)

3. mTOR Inhibitors (Mammalian Target of Rapamycin): These drugs block the action of mTOR, a protein involved in cell multiplication.

* Sirolimus (Rapamune)
* Everolimus (Zortress)
4. Antiproliferative/Antimetabolite Agents: These prevent the synthesis of DNA and thus prevent the proliferation of cells.

* Mycophenolate mofetil (CellCept)
* Mycophenolate sodium (Myfortic)
* Azathioprine (Imuran, Azasan)
* Methotrexate
5. Biologic Agents: These are antibodies or related proteins that target specific parts of the immune system.

* Infliximab (Remicade)
* Adalimumab (Humira)
* Rituximab (Rituxan, MabThera)
* Abatacept (Orencia)
* Etanercept (Enbrel)
* Ustekinumab (Stelara)
6. Antithymocyte Globulin (ATG):

* Thymoglobulin
* Atgam
7. Other Agents:

* Cyclophosphamide
* Chlorambucil


In [None]:
# Drug-name keywords used to find patients who received immunosuppressive
# prescriptions, grouped by drug class.
immunosuppressive_prescriptions = [
    # corticosteroids
    "prednisone", "methylprednisolone", "hydrocortisone", "dexamethasone",
    # calcineurin inhibitors
    "cyclosporine", "tacrolimus",
    # mTOR inhibitors
    "sirolimus", "everolimus",
    # antiproliferative / antimetabolite agents
    "mycophenolate mofetil", "mycophenolate sodium", "azathioprine", "methotrexate",
    # biologic agents
    "infliximab", "adalimumab", "rituximab", "abatacept", "etanercept", "ustekinumab",
    # antithymocyte globulin
    "thymoglobulin", "atgam",
    # other agents
    "cyclophosphamide", "chlorambucil",
]

# Distinct (patient, admission, drug) rows whose drug name contains a keyword.
precriptions_query = "SELECT DISTINCT subject_id, hadm_id, drug FROM `physionet-data.mimiciv_hosp.prescriptions`"
immunosuppressive_prescription_patients = add_like_query(
    precriptions_query,
    immunosuppressive_prescriptions,
    label="drug",
)
patient_immunosuppressant_drugs = run_query(immunosuppressive_prescription_patients)
patient_immunosuppressant_drugs

In [None]:
from google.colab import files
# Write the prescription matches to a CSV in the working directory, then ask
# Colab to download it. No `index=False` is passed, so the frame's index is
# written as the first column.
patient_immunosuppressant_drugs.to_csv('patient_immunosuppressant_drugs.csv')
files.download('patient_immunosuppressant_drugs.csv')

### HAI Patients

In [None]:
# Load the HAI-positive patients CSV from the Drive path built in the setup cell.
hai_patients_csv = pd.read_csv(file_path)
hai_patients_csv

In [None]:
# Distinct ICD codes that appear in the HAI-positive CSV.
hai_icd_codes = hai_patients_csv['icd_code'].unique()

# ICD dictionary lookup (code + long title) restricted to those codes.
hai_icd_in_list = format_icd_codes(hai_icd_codes)
hai_icd_diagnoses = f"SELECT icd_code, long_title FROM `physionet-data.mimiciv_hosp.d_icd_diagnoses` where icd_code in ({hai_icd_in_list})"

# get patients with HAIs ICDCodes
# (`limit` is left at its default, so the query carries LIMIT 1000)
hai_patients_sql = generate_sql_with_icd_codes(hai_icd_codes)
hai_patients = run_query(hai_patients_sql)

In [None]:
# Histogram of the HAI_Type column of the HAI-positive CSV (one bar per type value).
plt.hist(hai_patients_csv['HAI_Type'])

In [None]:
# Diagnoses matching the immune-compromised keywords, exposed as the
# `PatientAdmission` CTE for the PatientAdmission join.
# NOTE(review): this CTE selects only icd_code and long_title, while the
# PatientAdmission join in `get_patient_query` reads `patad.hadm_id`; the
# query is therefore likely to be rejected by BigQuery -- confirm which CTE
# was intended here.
all_pat_cte = {}
all_hai_diagnoses = add_like_query("SELECT icd_code, long_title FROM `physionet-data.mimiciv_hosp.d_icd_diagnoses`", immune_compromised_terms)

all_pat_cte["PatientAdmission"] = all_hai_diagnoses

# NOTE(review): this rebinds `immune_compromised_patients` using the default
# `limit=True` (LIMIT 1000) -- confirm that overwriting the earlier,
# unlimited result is intended.
immune_compromised_patients_sql = generate_sql_with_ctes(all_pat_cte, get_patient_query([Type.PatientAdmission]))
immune_compromised_patients = run_query(immune_compromised_patients_sql)
# The stray bare name `PatientAdmission` that followed was removed: it is not
# defined as a Python variable and raised NameError.
immune_compromised_patients

### Stay Duration

In [None]:
# ICU stays (with length of stay) for immune-compromised patients.
icu_stay_query = "SELECT subject_id, hadm_id, stay_id, los FROM `physionet-data.mimiciv_icu.icustays`"

# CTEs consumed by the joins requested below: ICU stays and the
# immune-compromised ICD code lookup.
patients_with_los_cte = {
    "ICUStay": icu_stay_query,
    "ICDCodes": immune_compromised_diagnoses,
}

# Unlimited patient query, left-joined to ICU stays and inner-joined to the codes.
immune_compromised_los_base = get_patient_query([Type.ICUStay, Type.ICDCodes], False)
patients_with_immune_compromised_los_sql = generate_sql_with_ctes(
    patients_with_los_cte, immune_compromised_los_base)
patients_with_immune_compromised_los = run_query(patients_with_immune_compromised_los_sql)
patients_with_immune_compromised_los.columns


Index(['subject_id', 'hadm_id', 'seq_num', 'icd_code', 'icd_version',
       'subject_id_1', 'hadm_id_1', 'stay_id', 'los', 'icd_code_1',
       'long_title', 'subject_id_2', 'hadm_id_2', 'admittime', 'dischtime',
       'deathtime', 'admission_type', 'admit_provider_id',
       'admission_location', 'discharge_location', 'insurance', 'language',
       'marital_status', 'race', 'edregtime', 'edouttime',
       'hospital_expire_flag', 'subject_id_3', 'gender', 'anchor_age',
       'anchor_year', 'anchor_year_group', 'dod'],
      dtype='object')

In [None]:
# The drop of the suffixed duplicate join columns below is disabled, so the
# frame still carries every column returned by the query.
# patients_with_immune_compromised_los = patients_with_immune_compromised_los.drop([ 'subject_id_1', 'hadm_id_1', 'icd_code_1', 'subject_id_2', 'hadm_id_2',  'subject_id_3'], axis = 1)
patients_with_immune_compromised_los.columns


Index(['subject_id', 'hadm_id', 'seq_num', 'icd_code', 'icd_version',
       'subject_id_1', 'hadm_id_1', 'stay_id', 'los', 'icd_code_1',
       'long_title', 'subject_id_2', 'hadm_id_2', 'admittime', 'dischtime',
       'deathtime', 'admission_type', 'admit_provider_id',
       'admission_location', 'discharge_location', 'insurance', 'language',
       'marital_status', 'race', 'edregtime', 'edouttime',
       'hospital_expire_flag', 'subject_id_3', 'gender', 'anchor_age',
       'anchor_year', 'anchor_year_group', 'dod'],
      dtype='object')

In [None]:
from google.colab import files

# Export only the identifiers and the diagnosis code/title.
column_to_include = ["subject_id", "hadm_id", "stay_id", "icd_code", "long_title"]
immune_compromised_patients_shorten = patients_with_immune_compromised_los.loc[:, column_to_include]

immune_compromised_export_name = 'patient_immune_compromised.csv'
immune_compromised_patients_shorten.to_csv(immune_compromised_export_name, columns=column_to_include)
files.download(immune_compromised_export_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# icu stay for HAI patients
# The comma between `stay_id` and `los` was missing, which made SQL read
# `stay_id los` as "stay_id AS los": the `los` column then held stay ids and
# no real length of stay was returned.
icu_stay_query = "SELECT subject_id, hadm_id, stay_id, los FROM `physionet-data.mimiciv_icu.icustays`"
patients_with_los_cte = {}
patients_with_los_cte["ICUStay"] = icu_stay_query
# ICD dictionary rows restricted to the codes found in the HAI CSV.
patients_with_los_cte["ICDCodes"] = hai_icd_diagnoses

# Unlimited patient query, left-joined to ICU stays and inner-joined to the HAI codes.
patients_with_los = generate_sql_with_ctes(patients_with_los_cte, get_patient_query([Type.ICUStay, Type.ICDCodes], False))
patients_with_hai_los = run_query(patients_with_los)
patients_with_hai_los

In [None]:
from google.colab import files

# Remove the suffixed duplicates of the join keys before exporting.
duplicate_join_columns = [
    'subject_id_1', 'hadm_id_1', 'icd_code_1',
    'subject_id_2', 'hadm_id_2', 'subject_id_3',
]
patients_with_hai_los = patients_with_hai_los.drop(columns=duplicate_join_columns)

patients_with_hai_los.to_csv('patients_with_hai_los.csv')
files.download('patients_with_hai_los.csv')

In [None]:
# Union of every (subject_id, hadm_id) pair found in any of the cohort tables.
# DISTINCT matters: the source tables can hold several rows per admission, and
# without it the outer joins repeat the same pair many times, which in turn
# multiplies every admission row in the final result.
all_patient_query = '''
SELECT DISTINCT subject_id, hadm_id FROM `comp90089-hai-patients.hai_patients.central_line_patients`
FULL OUTER JOIN `comp90089-hai-patients.hai_patients.ventilator_patients` USING (subject_id, hadm_id)
FULL OUTER JOIN `comp90089-hai-patients.hai_patients.urinary_catheter_patients` USING (subject_id, hadm_id)
FULL OUTER JOIN `comp90089-hai-patients.hai_patients.hai_positive_patients` USING (subject_id, hadm_id)
FULL OUTER JOIN `comp90089-hai-patients.hai_patients.patient_immune_compromised` USING (subject_id, hadm_id)
'''
# icu stay for all patients
patients_with_los_cte = {}
icu_stay_query = "SELECT subject_id, hadm_id, stay_id, los FROM `physionet-data.mimiciv_icu.icustays`"
patients_with_los_cte["ICUStay"] = icu_stay_query
patients_with_los_cte["PatientAdmission"] = all_patient_query

# Base template driven by admissions (aliased `d`, as the JOIN clauses expect)
# instead of diagnoses; the two "{}" slots take the JOINs and the LIMIT clause.
override_patient_query = """SELECT *
FROM `physionet-data.mimiciv_hosp.admissions` d
{}
INNER JOIN `physionet-data.mimiciv_hosp.patients` p
  ON d.subject_id = p.subject_id
{}"""

patients_detail_query = generate_sql_with_ctes(patients_with_los_cte, get_patient_query([Type.ICUStay, Type.PatientAdmission], False, override_patient_query))
patients_detail = run_query(patients_detail_query)
patients_detail

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,...,stay_id,los,subject_id_2,hadm_id_2,subject_id_3,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10106244,26713233,2147-05-09 10:34:00,2147-05-12 13:43:00,NaT,DIRECT EMER.,,PHYSICIAN REFERRAL,HOME,Other,...,34344296,2.548287,10106244,26713233,10106244,F,60,2144,2011 - 2013,NaT
1,15443666,27961368,2168-12-30 23:30:00,2169-01-05 16:02:00,NaT,OBSERVATION ADMIT,,EMERGENCY ROOM,HOME HEALTH CARE,Other,...,39190279,0.707882,15443666,27961368,15443666,F,75,2167,2014 - 2016,NaT
2,16299919,26977065,2193-05-15 08:37:00,2193-05-17 16:03:00,NaT,OBSERVATION ADMIT,,EMERGENCY ROOM,HOSPICE,Medicare,...,,,16299919,26977065,16299919,F,91,2189,2011 - 2013,2194-02-03
3,16299919,26977065,2193-05-15 08:37:00,2193-05-17 16:03:00,NaT,OBSERVATION ADMIT,,EMERGENCY ROOM,HOSPICE,Medicare,...,,,16299919,26977065,16299919,F,91,2189,2011 - 2013,2194-02-03
4,16299919,26977065,2193-05-15 08:37:00,2193-05-17 16:03:00,NaT,OBSERVATION ADMIT,,EMERGENCY ROOM,HOSPICE,Medicare,...,,,16299919,26977065,16299919,F,91,2189,2011 - 2013,2194-02-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749381,19956654,22281279,2137-06-26 13:07:00,2137-06-27 12:15:00,NaT,AMBULATORY OBSERVATION,P99YFT,PROCEDURE SITE,,Medicare,...,,,19956654,22281279,19956654,M,74,2136,2011 - 2013,2139-01-25
749382,19956654,22281279,2137-06-26 13:07:00,2137-06-27 12:15:00,NaT,AMBULATORY OBSERVATION,P99YFT,PROCEDURE SITE,,Medicare,...,,,19956654,22281279,19956654,M,74,2136,2011 - 2013,2139-01-25
749383,19956654,22281279,2137-06-26 13:07:00,2137-06-27 12:15:00,NaT,AMBULATORY OBSERVATION,P99YFT,PROCEDURE SITE,,Medicare,...,,,19956654,22281279,19956654,M,74,2136,2011 - 2013,2139-01-25
749384,12104929,28540937,2162-07-12 22:48:00,2162-07-17 16:10:00,NaT,EW EMER.,P99YTS,EMERGENCY ROOM,HOME HEALTH CARE,Medicare,...,32478117,2.794155,12104929,28540937,12104929,F,91,2161,2008 - 2010,2166-06-12


In [None]:
def _ceil_if_finite(value):
    """Round *value* up to a whole number; pass NaN and infinities through."""
    if pd.isna(value) or np.isinf(value):
        return value
    return np.ceil(value)


# Round every ICU length of stay up; the nullable Int64 dtype keeps missing
# stays as <NA>.
patients_detail['los'] = patients_detail['los'].apply(_ceil_if_finite).astype('Int64')

In [None]:
# This bare expression is not the last line of the cell, so it is evaluated and discarded.
patients_detail.columns
# Drop the suffixed duplicates of the join keys produced by SELECT * across the joins.
patients_detail = patients_detail.drop(["hadm_id_1", "hadm_id_2", "subject_id_1",  "subject_id_2", "subject_id_3"], axis=1)

In [None]:
# Show the remaining columns after the duplicate join keys were dropped.
patients_detail.columns

Index(['subject_id', 'hadm_id', 'admittime', 'dischtime', 'deathtime',
       'admission_type', 'admit_provider_id', 'admission_location',
       'discharge_location', 'insurance', 'language', 'marital_status', 'race',
       'edregtime', 'edouttime', 'hospital_expire_flag', 'stay_id', 'los',
       'gender', 'anchor_age', 'anchor_year', 'anchor_year_group', 'dod'],
      dtype='object')

In [None]:
from google.colab import files
# Write the per-admission detail frame to CSV (index included, since no
# `index=False` is passed) and ask Colab to download it.
patients_detail.to_csv('patient_detail.csv')
files.download('patient_detail.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Step 1 - Data Preprocessing


## Select only interested fields

In [None]:
# Columns kept for every cohort frame; the LOS frames additionally keep `los`.
base_columns = ['subject_id', 'hadm_id', 'icd_code', 'race', 'gender', 'anchor_age', 'dod']
los_columns = base_columns + ['los']

# Take explicit copies: later cells assign new columns into these frames, and
# a plain column selection would leave pandas treating them as slices of the
# query results (the resulting warning is currently hidden by the global
# warnings filter).
immune_compromised_patients = immune_compromised_patients[base_columns].copy()
patients_with_immune_compromised_los = patients_with_immune_compromised_los[los_columns].copy()
patients_with_hai_los = patients_with_hai_los[los_columns].copy()

## Check for missing values

In [None]:
# For every working frame: print a heading, the per-column null counts, and
# the total number of rows.
frames_to_check = [
    ("hai_patients", hai_patients_csv),
    ("\nimmune_compromised_patients", immune_compromised_patients),
    ("\npatient_immunosuppressant_drugs", patient_immunosuppressant_drugs),
    ("\npatients_with_immune_compromised_los", patients_with_immune_compromised_los),
    # derived based on the icd codes from csv file
    ("\npatients_with_hai_los", patients_with_hai_los),
]

for heading, frame in frames_to_check:
    print(heading)
    print(frame.isnull().sum())
    print("total:", frame.shape[0])

## Preview

In [None]:
# First rows of the HAI-positive CSV.
hai_patients_csv.head()

In [None]:
# First rows of the immune-compromised cohort.
immune_compromised_patients.head()

In [None]:
# First rows of the immunosuppressant prescription matches.
patient_immunosuppressant_drugs.head()

In [None]:
# First rows of the immune-compromised cohort with ICU length of stay.
patients_with_immune_compromised_los.head()

## Convert categorical to numerical format
`survived` is True (1) if `dod` is missing, i.e. no date of death is recorded, and False (0) otherwise
`gender_num` is 1 if `gender`='M', 0 if `gender`='F'

In [None]:
# Integer code assigned to each race label.
# NOTE(review): labels missing from this table become NaN when mapped --
# confirm the table covers every value present in the data.
race_mapping = {
    'BLACK/AFRICAN AMERICAN': 0,
    'WHITE - RUSSIAN': 1,
    'WHITE': 2,
    'UNKNOWN': 3,
    'OTHER': 4,
    'HISPANIC OR LATINO': 5,
    'HISPANIC/LATINO - CENTRAL AMERICAN': 6,
    'WHITE - OTHER EUROPEAN': 7,
    'PORTUGUESE': 8,
    'UNABLE TO OBTAIN': 9,
    'HISPANIC/LATINO - HONDURAN': 10,
    'BLACK/CARIBBEAN ISLAND': 11,
    'HISPANIC/LATINO - PUERTO RICAN': 12,
    'ASIAN': 13,
    'ASIAN - CHINESE': 14,
    'BLACK/AFRICAN': 15,
    'HISPANIC/LATINO - DOMINICAN': 16,
    'AMERICAN INDIAN/ALASKA NATIVE': 17,
    'SOUTH AMERICAN': 18,
    'PATIENT DECLINED TO ANSWER': 19,
    'ASIAN - SOUTH EAST ASIAN': 20,
    'MULTIPLE RACE/ETHNICITY': 21,
    'BLACK/CAPE VERDEAN': 22,
    'ASIAN - ASIAN INDIAN': 23
}


def convert_to_numerical_values(df):
  """Return a copy of *df* with categorical fields encoded numerically.

  The input frame is left untouched. In the returned frame:
    * `survived` is True where `dod` is missing or not parseable as a date;
    * `gender_num` is 0 for 'F' and 1 for 'M' (NaN for anything else);
    * `race` is replaced by its `race_mapping` code (NaN when unmapped);
    * `los`, if present, is rounded to the nearest whole number and stored
      as nullable `Int64`, so rows without an ICU stay keep <NA> instead of
      making the integer cast fail;
    * `dod` and `gender` are dropped and remaining numeric columns rounded.

  Args:
    df: DataFrame with at least `dod`, `gender` and `race` columns.

  Returns:
    A new DataFrame with the conversions applied.
  """
  # Work on a copy so the caller's frame is not modified as a side effect.
  frame = df.copy()

  # convert to numerical values
  frame['survived'] = pd.to_datetime(frame['dod'], errors='coerce').isna()
  frame['gender_num'] = frame['gender'].map({'F': 0, 'M': 1})
  frame['race'] = frame['race'].map(race_mapping)

  # round LOS to the nearest whole number (still float here, so NaN survives)
  has_los = 'los' in frame.columns
  if has_los:
    frame['los'] = frame['los'].round(0)

  # drop unused fields, then round whatever numeric columns remain
  frame = frame.drop(['dod', 'gender'], axis=1).round()

  if has_los:
    # Nullable integer dtype: plain `int` cannot represent a missing LOS.
    frame['los'] = frame['los'].astype('Int64')

  return frame

# Encode each cohort frame and rebind the name to the converted result.
immune_compromised_patients = convert_to_numerical_values(immune_compromised_patients)
patients_with_immune_compromised_los = convert_to_numerical_values(patients_with_immune_compromised_los)
patients_with_hai_los = convert_to_numerical_values(patients_with_hai_los)

In [None]:
# First rows of the immune-compromised cohort after numeric conversion.
immune_compromised_patients.head()

In [None]:
# First rows of the immune-compromised LOS frame after numeric conversion.
patients_with_immune_compromised_los.head()

# Step 2 - Feature Engineering
Feature Engineering: Create new features based on existing ones to improve model performance.

E.g. calculate infection rates within the first 48 hours of hospital admission.


# Step 3 - Exploratory Data Analysis (EDA)

Statistical Analysis: Descriptive statistics, correlations, etc.

Data Visualisation: Graphical representation of data distribution, relationships, etc.

Feature Importance Analysis: Identify which features are likely to have the most impact on the model. (Preliminary Tree Based Models)

## Relation between ICD and each HAI Type

In [None]:
# Count distinct subject_id values in each cohort (one row kept per patient).
hai_unique_patients = hai_patients_csv.drop_duplicates(subset=['subject_id'])
print("Total HAI patients:", len(hai_unique_patients))

immune_unique_patients = immune_compromised_patients.drop_duplicates(subset=['subject_id'])
print("Total Immune Compromised Patients patients:", len(immune_unique_patients))

In [None]:
# subject_ids that occur both in the HAI CSV and in the immune-compromised cohort
hai_subject_ids = set(hai_patients_csv['subject_id'])
matching_immune_compromised_and_hai_patients = hai_subject_ids.intersection(
    immune_compromised_patients['subject_id'])
len(matching_immune_compromised_and_hai_patients)

In [None]:
# combined icd

In [None]:
# Distinct ICD codes present in the immune-compromised cohort.
unique_icd_immune_compromised = np.array(immune_compromised_patients['icd_code'].unique())

# One row per patient, one column per ICD code, cell = number of matching rows.
df = pd.DataFrame(immune_compromised_patients)
pivot_df = df.pivot_table(index=['subject_id'],
                            columns='icd_code',
                            aggfunc='size',
                            fill_value=0).reset_index()

# Column names carrying an 'icd_' prefix (f-string so a non-string code
# cannot break the concatenation).
custom_column_names = [f'icd_{code}' for code in unique_icd_immune_compromised]

# Drop the leftover columns-axis name. The index was already reset above;
# resetting it a second time only added a spurious 'index' column that then
# leaked into the merge and the correlation matrices.
pivot_df.columns.name = None
pivot_df = pivot_df.rename(columns=dict(zip(unique_icd_immune_compromised, custom_column_names)))
pivot_df

In [None]:
# One row per patient, one column per HAI type, cell = number of matching rows.
df2 = pd.DataFrame(hai_patients_csv)
pivot_df2 = df2.pivot_table(index=['subject_id'],
                            columns='HAI_Type',
                            aggfunc='size',
                            fill_value=0).reset_index()

# Column names carrying a 'type_' prefix (f-string so a non-string value
# cannot break the concatenation).
hai_type_values = hai_patients_csv['HAI_Type'].unique()
custom_column_names = [f'type_{code}' for code in hai_type_values]

# Drop the leftover columns-axis name. The index was already reset above;
# resetting it a second time only added a spurious 'index' column that then
# leaked into the merge and the correlation matrices.
pivot_df2.columns.name = None
pivot_df2 = pivot_df2.rename(columns=dict(zip(hai_type_values, custom_column_names)))
pivot_df2

In [None]:
# Patients present in both pivots: ICD flags on the left, HAI-type flags on the right.
merged_df = pd.merge(pivot_df, pivot_df2, how='inner', on='subject_id')
type_columns = merged_df.filter(like='type_')

# Turn the counts into 0/1 indicators: any type_xx or icd_xx value above 1 becomes 1.
for flag_prefix in ('type_', 'icd_'):
    above_one = merged_df.filter(like=flag_prefix) > 1
    merged_df[above_one] = 1

merged_df

In [None]:
# Per-HAI-type subsets: rows whose indicator for that type equals 1.
merged_df_VAP = merged_df.loc[merged_df['type_VAP'].eq(1)]
merged_df_CLABSI = merged_df.loc[merged_df['type_CLABSI'].eq(1)]
merged_df_CAUTI = merged_df.loc[merged_df['type_CAUTI'].eq(1)]

In [None]:
# Column subsets by prefix (not used by the correlation calls in this cell).
type_columns, icd_columns = (
    merged_df.filter(like=prefix) for prefix in ('type_', 'icd_')
)

# Pairwise correlation of every column within each HAI-specific subset.
# NOTE(review): each subset was filtered to rows where its own type_ flag
# equals 1, so that column is constant inside the subset -- confirm this is
# the comparison that was intended.
correlation_matrix_CLABSI = merged_df_CLABSI.corr()
correlation_matrix_CAUTI = merged_df_CAUTI.corr()
correlation_matrix_VAP = merged_df_VAP.corr()

In [None]:
def plot_correlation(correlation_matrix, hai_type):
    """Draw and show a heatmap of *correlation_matrix*, titled for *hai_type*.

    Uses a fixed grey/blue/red palette and a 6x4 figure.
    """
    three_colour_palette = sns.color_palette(['grey', 'blue', 'red'])

    plt.figure(figsize=(6, 4))
    sns.heatmap(
        correlation_matrix,
        annot=False,
        cmap=three_colour_palette,
        linewidths=0.5,
    )
    plt.title(f'Correlation Heatmap for {hai_type}')
    plt.show()
# One heatmap per HAI type, in the order CLABSI, CAUTI, VAP.
heatmaps_to_draw = (
    (correlation_matrix_CLABSI, 'type_CLABSI'),
    (correlation_matrix_CAUTI, 'type_CAUTI'),
    (correlation_matrix_VAP, 'type_VAP'),
)
for matrix, type_label in heatmaps_to_draw:
    plot_correlation(matrix, type_label)

## Related Antibiotics

In [None]:
# Immunosuppressant prescriptions for admissions that appear in BOTH the
# immune-compromised cohort and the HAI CSV: two inner joins on
# (subject_id, hadm_id).
# inputs: patient_immunosuppressant_drugs, immune_compromised_patients, hai_patients_csv
admission_keys = ['subject_id', 'hadm_id']
result_df = (
    patient_immunosuppressant_drugs
    .merge(immune_compromised_patients, on=admission_keys, how='inner')
    .merge(hai_patients_csv, on=admission_keys, how='inner')
)
result_df

In [None]:
# List the merged frame's columns (overlapping names get _x / _y suffixes from the merges).
result_df.columns

In [None]:
# Keep the identifying columns only and collapse exact repeats.
record_columns = ['subject_id', 'hadm_id', 'drug', 'icd_code_x', 'HAI_Type']
distinct_records_df = result_df[record_columns].drop_duplicates()
distinct_records_df

# antibiotics that matched both HAI and immunocompromised patients

In [None]:
df3 = pd.DataFrame(distinct_records_df)

# Number of distinct records for every (drug, HAI type) pair.
pair_sizes = df3.groupby(['drug', 'HAI_Type']).size()
counts = pair_sizes.reset_index(name='count')

# Only pairs seen more than 4 times are plotted.
counts = counts.loc[counts['count'] > 4]

# One blue bar per "drug - HAI type" pair.
bar_labels = counts['drug'] + ' - ' + counts['HAI_Type']
plt.figure(figsize=(12, 20))
plt.bar(bar_labels, counts['count'], color='blue')

# Axis labels and title
plt.xlabel('Drug - HAI_Type')
plt.ylabel('Count')
plt.title('Count of Duplicated Records by Drug and HAI_Type')

# Tilt the x tick labels so they stay readable, fit the layout, then render.
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Note