<a href="https://colab.research.google.com/github/tronglinux123/AI-Project/blob/main/finalisedcohortdata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install google-cloud-bigquery



In [None]:
from datetime import timedelta
import os

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from IPython.display import display, HTML, Image
%matplotlib inline

plt.style.use('ggplot')
plt.rcParams.update({'font.size': 20})

# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery
from google.colab import drive

In [None]:
auth.authenticate_user()
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Set up environment variables
project_id = 'project-physionet-data'
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id

# Read data from BigQuery into pandas dataframes.
def run_query(query, project_id=project_id):
  return pd.io.gbq.read_gbq(
      query,
      project_id=project_id,
      dialect='standard')

# set the dataset
dataset = 'mimiciv'


# Add all the additional features to our cohort of **patients**

In [None]:
data_x_ray = pd.read_csv('/content/drive/MyDrive/doan_demo/x_ray_data.csv')

In [None]:
data = data_x_ray[['subject_id', 'hadm_id', 'time']]
data = data.drop(data.index[-1])
data

Unnamed: 0,subject_id,hadm_id,time
0,18415616,29138337,2164-04-28T00:00:00
1,19136768,26451475,2151-04-08T00:00:00
2,19136768,27903990,2149-07-22T00:00:00
3,19136768,29901658,2150-12-24T00:00:00
4,10554112,25105282,2154-08-26T00:00:00
...,...,...,...
29077,10940509,26533493,2201-01-01T00:00:00
29078,19025237,20041142,2171-07-24T00:00:00
29079,14733367,21448520,2148-03-26T00:00:00
29080,16117323,24175963,2191-11-16T00:00:00


### **Diabete**

In [None]:
# Find ICD code related to diabete
df = run_query("""
SELECT icd_code, long_title FROM `physionet-data.mimiciv_2_2_hosp.d_icd_diagnoses`
WHERE LOWER(long_title) LIKE "%diabete%"
""")
print(df)

# ICD code list
icd_codes_diabete = df['icd_code'].to_list()
icd_codes_string_diabete = ', '.join(["'" + icd + "'" for icd in icd_codes_diabete])
print(icd_codes_string_diabete)

  return pd.io.gbq.read_gbq(


    icd_code                                         long_title
0      24900  Secondary diabetes mellitus without mention of...
1      24901  Secondary diabetes mellitus without mention of...
2      24910  Secondary diabetes mellitus with ketoacidosis,...
3      24911  Secondary diabetes mellitus with ketoacidosis,...
4      24920  Secondary diabetes mellitus with hyperosmolari...
..       ...                                                ...
709     P702                         Neonatal diabetes mellitus
710    R7303                                        Prediabetes
711     Z131      Encounter for screening for diabetes mellitus
712     Z833                Family history of diabetes mellitus
713    Z8632           Personal history of gestational diabetes

[714 rows x 2 columns]
'24900', '24901', '24910', '24911', '24920', '24921', '24930', '24931', '24940', '24941', '24950', '24951', '24960', '24961', '24970', '24971', '24980', '24981', '24990', '24991', '25000', '25001', '25002', '

In [None]:
# Subject ID that have diabete
df_diabete = run_query(f"""
SELECT `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`.subject_id, `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`.hadm_id
FROM `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`
WHERE `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`.icd_code IN ({icd_codes_string_diabete})
""")
print(df_diabete)

  return pd.io.gbq.read_gbq(


        subject_id   hadm_id
0         10031358  24522342
1         10031358  24522342
2         10031687  25318200
3         10031687  25318200
4         10035168  26166196
...            ...       ...
149896    19907884  27481511
149897    19917249  26653646
149898    19918413  21520010
149899    19944094  20794406
149900    19964153  20368705

[149901 rows x 2 columns]


In [None]:
print(df_diabete.columns)

Index(['subject_id', 'hadm_id'], dtype='object')


In [None]:
print(data.columns)

Index(['subject_id', 'hadm_id', 'time'], dtype='object')


In [None]:
data

Unnamed: 0,subject_id,hadm_id,time
0,18415616,29138337,2164-04-28T00:00:00
1,19136768,26451475,2151-04-08T00:00:00
2,19136768,27903990,2149-07-22T00:00:00
3,19136768,29901658,2150-12-24T00:00:00
4,10554112,25105282,2154-08-26T00:00:00
...,...,...,...
29077,10940509,26533493,2201-01-01T00:00:00
29078,19025237,20041142,2171-07-24T00:00:00
29079,14733367,21448520,2148-03-26T00:00:00
29080,16117323,24175963,2191-11-16T00:00:00


In [None]:
# Add the 'diabete' feature to df
data['diabete'] = data['subject_id'].isin(df_diabete['subject_id']).astype(int)
print(data)

       subject_id   hadm_id                 time  diabete
0        18415616  29138337  2164-04-28T00:00:00        0
1        19136768  26451475  2151-04-08T00:00:00        1
2        19136768  27903990  2149-07-22T00:00:00        1
3        19136768  29901658  2150-12-24T00:00:00        1
4        10554112  25105282  2154-08-26T00:00:00        1
...           ...       ...                  ...      ...
29077    10940509  26533493  2201-01-01T00:00:00        0
29078    19025237  20041142  2171-07-24T00:00:00        1
29079    14733367  21448520  2148-03-26T00:00:00        1
29080    16117323  24175963  2191-11-16T00:00:00        1
29081    13312176  20763054  2136-07-28T00:00:00        0

[29082 rows x 4 columns]


### **Age**

In [None]:
df_age = run_query("""
SELECT a.age, a.hadm_id
FROM `physionet-data.mimiciv_2_2_derived.age` a
""")
print(df_age)

  return pd.io.gbq.read_gbq(


        age   hadm_id
0        18  24990817
1        18  23975373
2        18  23815664
3        18  24431717
4        18  28991767
...     ...       ...
431226   92  20026585
431227   92  20847686
431228   92  28546062
431229   92  25716403
431230   92  23116022

[431231 rows x 2 columns]


In [None]:
# Merge the two DataFrames based on 'subject_id'
data = data.merge(df_age, on='hadm_id', how='left')

# Drop duplicate 'subject_id' rows, keeping only the last age occurrence
data = data.drop_duplicates(subset='hadm_id', keep='last')

print(data)

       subject_id   hadm_id                 time  diabete  age
0        18415616  29138337  2164-04-28T00:00:00        0   50
1        19136768  26451475  2151-04-08T00:00:00        1   77
2        19136768  27903990  2149-07-22T00:00:00        1   75
3        19136768  29901658  2150-12-24T00:00:00        1   76
4        10554112  25105282  2154-08-26T00:00:00        1   39
...           ...       ...                  ...      ...  ...
29077    10940509  26533493  2201-01-01T00:00:00        0   80
29078    19025237  20041142  2171-07-24T00:00:00        1   57
29079    14733367  21448520  2148-03-26T00:00:00        1   78
29080    16117323  24175963  2191-11-16T00:00:00        1   68
29081    13312176  20763054  2136-07-28T00:00:00        0   32

[29082 rows x 5 columns]


### **HIV**

In [None]:
# Find ICD code related to HIV
df = run_query("""
SELECT icd_code, long_title FROM `physionet-data.mimiciv_2_2_hosp.d_icd_diagnoses`
WHERE LOWER(long_title) LIKE "%hiv%"
""")
print(df)

# ICD code list
icd_codes_hiv = df['icd_code'].to_list()
icd_codes_string_hiv = ', '.join(["'" + icd + "'" for icd in icd_codes_hiv])
print(icd_codes_string_hiv)

  return pd.io.gbq.read_gbq(


   icd_code                                         long_title
0       042         Human immunodeficiency virus [HIV] disease
1     07953       Human immunodeficiency virus, type 2 [HIV-2]
2     79571  Nonspecific serologic evidence of human immuno...
3       V08  Asymptomatic human immunodeficiency virus [HIV...
4     V6544      Human immunodeficiency virus (HIV) counseling
5       B20         Human immunodeficiency virus [HIV] disease
6     B9735  Human immunodeficiency virus, type 2 [HIV 2] a...
7      O987  Human immunodeficiency virus [HIV] disease com...
8     O9871  Human immunodeficiency virus [HIV] disease com...
9    O98711  Human immunodeficiency virus [HIV] disease com...
10   O98712  Human immunodeficiency virus [HIV] disease com...
11   O98713  Human immunodeficiency virus [HIV] disease com...
12   O98719  Human immunodeficiency virus [HIV] disease com...
13    O9872  Human immunodeficiency virus [HIV] disease com...
14    O9873  Human immunodeficiency virus [HIV] disease

In [None]:
# Subject ID that have hiv
df_hiv = run_query(f"""
SELECT `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`.subject_id
FROM `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`
WHERE `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`.icd_code IN ({icd_codes_string_hiv})
""")
print(df_hiv)

  return pd.io.gbq.read_gbq(


      subject_id
0       10361837
1       10947245
2       11745965
3       12529718
4       13164041
...          ...
7996    19061282
7997    17041034
7998    13806858
7999    16442963
8000    17607781

[8001 rows x 1 columns]


In [None]:
# Add the 'hiv' feature to df
data['hiv'] = data['subject_id'].isin(df_hiv['subject_id']).astype(int)
print(data)

       subject_id   hadm_id                 time  diabete  age  hiv
0        18415616  29138337  2164-04-28T00:00:00        0   50    0
1        19136768  26451475  2151-04-08T00:00:00        1   77    0
2        19136768  27903990  2149-07-22T00:00:00        1   75    0
3        19136768  29901658  2150-12-24T00:00:00        1   76    0
4        10554112  25105282  2154-08-26T00:00:00        1   39    0
...           ...       ...                  ...      ...  ...  ...
29077    10940509  26533493  2201-01-01T00:00:00        0   80    1
29078    19025237  20041142  2171-07-24T00:00:00        1   57    0
29079    14733367  21448520  2148-03-26T00:00:00        1   78    0
29080    16117323  24175963  2191-11-16T00:00:00        1   68    0
29081    13312176  20763054  2136-07-28T00:00:00        0   32    1

[29082 rows x 6 columns]


### **Oxygen**

In [None]:
df = run_query("""
SELECT *
FROM `physionet-data.mimiciv_2_2_hosp.d_labitems`
WHERE LOWER(LABEL) LIKE '%oxygen%'
""")
print(df)

  return pd.io.gbq.read_gbq(


   itemid              label  fluid   category
0   50816             Oxygen  Blood  Blood Gas
1   50817  Oxygen Saturation  Blood  Blood Gas


In [None]:
df_oxygen = run_query("""
SELECT hadm_id, valuenum
FROM `physionet-data.mimiciv_2_2_hosp.labevents`
WHERE itemid = 50816
""")
print(df_oxygen)

  return pd.io.gbq.read_gbq(


        hadm_id  valuenum
0      29867930       NaN
1      20993059      53.0
2      23603937      45.0
3          <NA>      89.0
4      27646159      24.0
...         ...       ...
68846      <NA>     100.0
68847  23600342     100.0
68848      <NA>     100.0
68849      <NA>     100.0
68850  23245156     100.0

[68851 rows x 2 columns]


In [None]:
df_oxygen = df_oxygen.dropna()

In [None]:
data = data.merge(df_oxygen, on='hadm_id', how='left')
data = data.drop_duplicates(subset='hadm_id', keep='first')
data = data.rename(columns={'valuenum': 'oxygen'})

print(data)

       subject_id   hadm_id                 time  diabete  age  hiv  oxygen
0        18415616  29138337  2164-04-28T00:00:00        0   50    0     NaN
1        19136768  26451475  2151-04-08T00:00:00        1   77    0     NaN
2        19136768  27903990  2149-07-22T00:00:00        1   75    0     NaN
3        19136768  29901658  2150-12-24T00:00:00        1   76    0     NaN
4        10554112  25105282  2154-08-26T00:00:00        1   39    0     NaN
...           ...       ...                  ...      ...  ...  ...     ...
39340    10940509  26533493  2201-01-01T00:00:00        0   80    1     NaN
39341    19025237  20041142  2171-07-24T00:00:00        1   57    0     NaN
39342    14733367  21448520  2148-03-26T00:00:00        1   78    0     NaN
39343    16117323  24175963  2191-11-16T00:00:00        1   68    0     NaN
39344    13312176  20763054  2136-07-28T00:00:00        0   32    1     NaN

[29082 rows x 7 columns]


### **Heart rate**

In [None]:
df = run_query("""
SELECT itemid, label
FROM `physionet-data.mimiciv_2_2_icu.d_items`
WHERE LOWER(LABEL) LIKE '%heart%rate%'
""")
print(df)

  return pd.io.gbq.read_gbq(


   itemid                    label
0  220046  Heart rate Alarm - High
1  220047   Heart Rate Alarm - Low
2  220045               Heart Rate


In [None]:
df_heart = run_query("""
SELECT hadm_id, valuenum
FROM `physionet-data.mimiciv_2_2_icu.chartevents`
WHERE itemid = 220045
""")
print(df_heart)

  return pd.io.gbq.read_gbq(


          hadm_id  valuenum
0        22479127     128.0
1        27383823     131.0
2        28515801     132.0
3        21592506     128.0
4        27410896     162.0
...           ...       ...
6460834  23341511     117.0
6460835  28545446     117.0
6460836  28736378     117.0
6460837  28347569     117.0
6460838  23642129     117.0

[6460839 rows x 2 columns]


In [None]:
data = data.merge(df_heart, on='hadm_id', how='left')
data = data.drop_duplicates(subset='hadm_id', keep='first')
data = data.rename(columns={'valuenum': 'heart_rate'})

print(data)

         subject_id   hadm_id                 time  diabete  age  hiv  oxygen  \
0          18415616  29138337  2164-04-28T00:00:00        0   50    0     NaN   
1          19136768  26451475  2151-04-08T00:00:00        1   77    0     NaN   
2          19136768  27903990  2149-07-22T00:00:00        1   75    0     NaN   
3          19136768  29901658  2150-12-24T00:00:00        1   76    0     NaN   
4          10554112  25105282  2154-08-26T00:00:00        1   39    0     NaN   
...             ...       ...                  ...      ...  ...  ...     ...   
1159133    10940509  26533493  2201-01-01T00:00:00        0   80    1     NaN   
1159134    19025237  20041142  2171-07-24T00:00:00        1   57    0     NaN   
1159162    14733367  21448520  2148-03-26T00:00:00        1   78    0     NaN   
1159163    16117323  24175963  2191-11-16T00:00:00        1   68    0     NaN   
1159164    13312176  20763054  2136-07-28T00:00:00        0   32    1     NaN   

         heart_rate  
0    

### **Temperature**

In [None]:
df = run_query("""
SELECT itemid, label
FROM `physionet-data.mimiciv_2_2_icu.d_items`
WHERE LOWER(LABEL) LIKE '%temperature%'
""")
print(df)

  return pd.io.gbq.read_gbq(


   itemid                        label
0  224027             Skin Temperature
1  228242  Pt. Temperature (BG) (SOFT)
2  224674       Changes in Temperature
3  229236     Cerebral Temperature (C)
4  224642             Temperature Site
5  223761       Temperature Fahrenheit
6  223762          Temperature Celsius
7  226329    Blood Temperature CCO (C)
8  227054        TemperatureF_ApacheIV


In [None]:
df_temperature = run_query("""
SELECT hadm_id, valuenum
FROM `physionet-data.mimiciv_2_2_icu.chartevents`
WHERE itemid = 223761
""")
print(df_temperature)

  return pd.io.gbq.read_gbq(


          hadm_id  valuenum
0        20859990     103.0
1        26772218     100.9
2        25889571     101.1
3        28416715      96.0
4        28588560      96.2
...           ...       ...
1515957  22302536     100.6
1515958  21578343     100.6
1515959  20172393     100.6
1515960  29449120     100.6
1515961  28394491     100.6

[1515962 rows x 2 columns]


In [None]:
data = data.merge(df_temperature, on='hadm_id', how='left')
data = data.drop_duplicates(subset='hadm_id', keep='first')
data = data.rename(columns={'valuenum': 'temperature'})

print(data)

        subject_id   hadm_id                 time  diabete  age  hiv  oxygen  \
0         18415616  29138337  2164-04-28T00:00:00        0   50    0     NaN   
1         19136768  26451475  2151-04-08T00:00:00        1   77    0     NaN   
2         19136768  27903990  2149-07-22T00:00:00        1   75    0     NaN   
3         19136768  29901658  2150-12-24T00:00:00        1   76    0     NaN   
4         10554112  25105282  2154-08-26T00:00:00        1   39    0     NaN   
...            ...       ...                  ...      ...  ...  ...     ...   
291484    10940509  26533493  2201-01-01T00:00:00        0   80    1     NaN   
291485    19025237  20041142  2171-07-24T00:00:00        1   57    0     NaN   
291492    14733367  21448520  2148-03-26T00:00:00        1   78    0     NaN   
291493    16117323  24175963  2191-11-16T00:00:00        1   68    0     NaN   
291494    13312176  20763054  2136-07-28T00:00:00        0   32    1     NaN   

        heart_rate  temperature  
0    

### **Complete Blood Count**

In [None]:
df_cbc = run_query("""
SELECT hadm_id, hemoglobin, rbc, wbc
FROM `physionet-data.mimiciv_2_2_derived.complete_blood_count`
""")
print(df_cbc)

  return pd.io.gbq.read_gbq(


          hadm_id  hemoglobin   rbc  wbc
0            <NA>        19.6   NaN  NaN
1            <NA>        15.6   NaN  NaN
2        23436329         5.9   NaN  NaN
3        20056386         6.9   NaN  NaN
4            <NA>         6.8   NaN  NaN
...           ...         ...   ...  ...
3362498      <NA>        14.8  4.66  5.6
3362499      <NA>        14.8  4.40  5.8
3362500      <NA>        14.8  4.41  7.7
3362501      <NA>        14.8  4.41  6.4
3362502      <NA>        14.8  4.17  3.9

[3362503 rows x 4 columns]


In [None]:
df_cbc = df_cbc.dropna()

In [None]:
data = data.merge(df_cbc, on='hadm_id', how='left')



In [None]:
# Drop duplicate 'subject_id' rows, keeping only the first occurrence
data = data.drop_duplicates(subset='hadm_id', keep='first')

print(data)

        subject_id   hadm_id                 time  diabete  age  hiv  oxygen  \
0         18415616  29138337  2164-04-28T00:00:00        0   50    0     NaN   
2         19136768  26451475  2151-04-08T00:00:00        1   77    0     NaN   
4         19136768  27903990  2149-07-22T00:00:00        1   75    0     NaN   
5         19136768  29901658  2150-12-24T00:00:00        1   76    0     NaN   
8         10554112  25105282  2154-08-26T00:00:00        1   39    0     NaN   
...            ...       ...                  ...      ...  ...  ...     ...   
224519    10940509  26533493  2201-01-01T00:00:00        0   80    1     NaN   
224522    19025237  20041142  2171-07-24T00:00:00        1   57    0     NaN   
224528    14733367  21448520  2148-03-26T00:00:00        1   78    0     NaN   
224531    16117323  24175963  2191-11-16T00:00:00        1   68    0     NaN   
224539    13312176  20763054  2136-07-28T00:00:00        0   32    1     NaN   

        heart_rate  temperature  hemogl

### **Sputum culture**

In [None]:
df = run_query("""
SELECT *
FROM `physionet-data.mimiciv_2_2_icu.d_items`
WHERE LOWER(label) LIKE '%sputum%culture%'
""")
print(df)


  return pd.io.gbq.read_gbq(


   itemid           label    abbreviation          linksto    category  \
0  225451  Sputum Culture  Sputum Culture  procedureevents  6-Cultures   

  unitname param_type  lownormalvalue  highnormalvalue  
0     None  Processes             NaN              NaN  


In [None]:
df_sputum = run_query("""
SELECT t.hadm_id, t.value
FROM `physionet-data.mimiciv_2_2_icu.procedureevents` t
WHERE itemid = 225451
""")

print(df_sputum)

  return pd.io.gbq.read_gbq(


       hadm_id  value
0     24181354    1.0
1     26488315    1.0
2     28128182    1.0
3     22081550    1.0
4     22081550    1.0
...        ...    ...
6641  20208898    1.0
6642  20617667    1.0
6643  20617667    1.0
6644  20277361    1.0
6645  25304202    1.0

[6646 rows x 2 columns]


In [None]:
data = data.merge(df_sputum, on='hadm_id', how='left')

print(data)

       subject_id   hadm_id                 time  diabete  age  hiv  oxygen  \
0        18415616  29138337  2164-04-28T00:00:00        0   50    0     NaN   
1        19136768  26451475  2151-04-08T00:00:00        1   77    0     NaN   
2        19136768  27903990  2149-07-22T00:00:00        1   75    0     NaN   
3        19136768  29901658  2150-12-24T00:00:00        1   76    0     NaN   
4        10554112  25105282  2154-08-26T00:00:00        1   39    0     NaN   
...           ...       ...                  ...      ...  ...  ...     ...   
29417    10940509  26533493  2201-01-01T00:00:00        0   80    1     NaN   
29418    19025237  20041142  2171-07-24T00:00:00        1   57    0     NaN   
29419    14733367  21448520  2148-03-26T00:00:00        1   78    0     NaN   
29420    16117323  24175963  2191-11-16T00:00:00        1   68    0     NaN   
29421    13312176  20763054  2136-07-28T00:00:00        0   32    1     NaN   

       heart_rate  temperature  hemoglobin   rbc   

In [None]:
data['sputum_culture'] = data['value']
data = data.drop('value', axis = 1)


In [None]:
print(data)

       subject_id   hadm_id                 time  diabete  age  hiv  oxygen  \
0        18415616  29138337  2164-04-28T00:00:00        0   50    0     NaN   
1        19136768  26451475  2151-04-08T00:00:00        1   77    0     NaN   
2        19136768  27903990  2149-07-22T00:00:00        1   75    0     NaN   
3        19136768  29901658  2150-12-24T00:00:00        1   76    0     NaN   
4        10554112  25105282  2154-08-26T00:00:00        1   39    0     NaN   
...           ...       ...                  ...      ...  ...  ...     ...   
29417    10940509  26533493  2201-01-01T00:00:00        0   80    1     NaN   
29418    19025237  20041142  2171-07-24T00:00:00        1   57    0     NaN   
29419    14733367  21448520  2148-03-26T00:00:00        1   78    0     NaN   
29420    16117323  24175963  2191-11-16T00:00:00        1   68    0     NaN   
29421    13312176  20763054  2136-07-28T00:00:00        0   32    1     NaN   

       heart_rate  temperature  hemoglobin   rbc   

### **Smoking**

In [None]:
# Diagnosis Codes that indicate smoking-related conditions
# F17.x (nicotine dependence), Z72.0 (tobacco use)
df_smoke = run_query("""
SELECT subject_id
FROM `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`
WHERE ICD_CODE LIKE 'F17%' OR
      ICD_CODE = 'Z72.0%'
      """)

print(df_smoke)



  return pd.io.gbq.read_gbq(


       subject_id
0        10051074
1        10055694
2        10119910
3        10236222
4        10303080
...           ...
17590    19157548
17591    19405755
17592    19410285
17593    19415839
17594    19907150

[17595 rows x 1 columns]


In [None]:
# Add the 'smoking' feature to df
data['smoke'] = data['subject_id'].isin(df_smoke['subject_id']).astype(int)
print(data)

       subject_id   hadm_id                 time  diabete  age  hiv  oxygen  \
0        18415616  29138337  2164-04-28T00:00:00        0   50    0     NaN   
1        19136768  26451475  2151-04-08T00:00:00        1   77    0     NaN   
2        19136768  27903990  2149-07-22T00:00:00        1   75    0     NaN   
3        19136768  29901658  2150-12-24T00:00:00        1   76    0     NaN   
4        10554112  25105282  2154-08-26T00:00:00        1   39    0     NaN   
...           ...       ...                  ...      ...  ...  ...     ...   
29417    10940509  26533493  2201-01-01T00:00:00        0   80    1     NaN   
29418    19025237  20041142  2171-07-24T00:00:00        1   57    0     NaN   
29419    14733367  21448520  2148-03-26T00:00:00        1   78    0     NaN   
29420    16117323  24175963  2191-11-16T00:00:00        1   68    0     NaN   
29421    13312176  20763054  2136-07-28T00:00:00        0   32    1     NaN   

       heart_rate  temperature  hemoglobin   rbc   

### **Cough**

In [None]:
# Diagnosis Codes that indicate cough
df_cough = run_query("""
SELECT hadm_id
FROM `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`
WHERE ICD_CODE LIKE 'R05%'
      """)

print(df_cough)



  return pd.io.gbq.read_gbq(


       hadm_id
0     24721736
1     25460255
2     26712428
3     22498927
4     23404999
...        ...
1302  27027521
1303  26213492
1304  22878551
1305  25962135
1306  23825504

[1307 rows x 1 columns]


In [None]:
# Add the 'cough' feature to df
data['cough'] = data['hadm_id'].isin(df_cough['hadm_id']).astype(int)
print(data)

       subject_id   hadm_id                 time  diabete  age  hiv  oxygen  \
0        18415616  29138337  2164-04-28T00:00:00        0   50    0     NaN   
1        19136768  26451475  2151-04-08T00:00:00        1   77    0     NaN   
2        19136768  27903990  2149-07-22T00:00:00        1   75    0     NaN   
3        19136768  29901658  2150-12-24T00:00:00        1   76    0     NaN   
4        10554112  25105282  2154-08-26T00:00:00        1   39    0     NaN   
...           ...       ...                  ...      ...  ...  ...     ...   
29417    10940509  26533493  2201-01-01T00:00:00        0   80    1     NaN   
29418    19025237  20041142  2171-07-24T00:00:00        1   57    0     NaN   
29419    14733367  21448520  2148-03-26T00:00:00        1   78    0     NaN   
29420    16117323  24175963  2191-11-16T00:00:00        1   68    0     NaN   
29421    13312176  20763054  2136-07-28T00:00:00        0   32    1     NaN   

       heart_rate  temperature  hemoglobin   rbc   

In [None]:
data = data.drop_duplicates(subset='hadm_id', keep='first')

print(data)

       subject_id   hadm_id                 time  diabete  age  hiv  oxygen  \
0        18415616  29138337  2164-04-28T00:00:00        0   50    0     NaN   
1        19136768  26451475  2151-04-08T00:00:00        1   77    0     NaN   
2        19136768  27903990  2149-07-22T00:00:00        1   75    0     NaN   
3        19136768  29901658  2150-12-24T00:00:00        1   76    0     NaN   
4        10554112  25105282  2154-08-26T00:00:00        1   39    0     NaN   
...           ...       ...                  ...      ...  ...  ...     ...   
29417    10940509  26533493  2201-01-01T00:00:00        0   80    1     NaN   
29418    19025237  20041142  2171-07-24T00:00:00        1   57    0     NaN   
29419    14733367  21448520  2148-03-26T00:00:00        1   78    0     NaN   
29420    16117323  24175963  2191-11-16T00:00:00        1   68    0     NaN   
29421    13312176  20763054  2136-07-28T00:00:00        0   32    1     NaN   

       heart_rate  temperature  hemoglobin   rbc   

#### **4/ Analysis of the data**

In [None]:
data

Unnamed: 0,subject_id,hadm_id,time,diabete,age,hiv,oxygen,heart_rate,temperature,hemoglobin,rbc,wbc,sputum_culture,smoke,cough
0,18415616,29138337,2164-04-28T00:00:00,0,50,0,,,,11.9,4.40,5.5,,0,0
1,19136768,26451475,2151-04-08T00:00:00,1,77,0,,,,12.1,3.96,7.0,,0,0
2,19136768,27903990,2149-07-22T00:00:00,1,75,0,,,,,,,,0,0
3,19136768,29901658,2150-12-24T00:00:00,1,76,0,,,,14.4,4.69,10.9,,0,0
4,10554112,25105282,2154-08-26T00:00:00,1,39,0,,86.0,99.3,8.5,2.73,26.5,,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29417,10940509,26533493,2201-01-01T00:00:00,0,80,1,,,,12.9,3.94,4.2,,0,0
29418,19025237,20041142,2171-07-24T00:00:00,1,57,0,,77.0,98.6,14.6,4.35,4.9,,0,0
29419,14733367,21448520,2148-03-26T00:00:00,1,78,0,,,,9.6,4.47,7.5,,0,0
29420,16117323,24175963,2191-11-16T00:00:00,1,68,0,,,,11.8,3.51,7.9,,0,0


In [None]:
data.isna().sum()

Unnamed: 0,0
subject_id,0
hadm_id,0
time,0
diabete,0
age,0
hiv,0
oxygen,26234
heart_rate,20749
temperature,20801
hemoglobin,2543


In [None]:
data.describe()

Unnamed: 0,subject_id,hadm_id,diabete,age,hiv,oxygen,heart_rate,temperature,hemoglobin,rbc,wbc,sputum_culture,smoke,cough
count,29082.0,29082.0,29082.0,29082.0,29082.0,2848.0,8333.0,8281.0,26539.0,26539.0,26539.0,1058.0,29082.0,29082.0
mean,15016000.0,25038070.0,0.483014,66.698748,0.037893,49.263904,84.231249,98.328379,10.164339,3.446914,8.832911,1.0,0.120177,0.003851
std,2866767.0,2879976.0,0.49972,15.929504,0.19094,22.475153,26.295117,14.544131,2.155202,0.751732,7.8645,0.0,0.325174,0.061939
min,10001180.0,20000060.0,0.0,18.0,0.0,0.0,0.0,0.0,2.3,0.78,0.1,1.0,0.0,0.0
25%,12484310.0,22551000.0,0.0,56.0,0.0,40.0,65.0,97.5,8.5,2.88,5.4,1.0,0.0,0.0
50%,15005520.0,25064720.0,0.0,67.0,0.0,40.0,77.0,97.9,10.0,3.42,7.6,1.0,0.0,0.0
75%,17477300.0,27521140.0,1.0,79.0,0.0,50.0,99.0,98.5,11.7,3.96,10.6,1.0,0.0,0.0
max,19999290.0,29999600.0,1.0,100.0,1.0,100.0,197.0,999.0,19.1,7.46,378.0,1.0,1.0,1.0


#### **5/ Add the diagnoses of each disease**

In [None]:
# Find ICD code related to pneumonia

df = run_query("""
SELECT icd_code, long_title FROM `physionet-data.mimiciv_2_2_hosp.d_icd_diagnoses`
WHERE LOWER(long_title) LIKE "%pneumonia%"
""")

# ICD code list
icd_codes = df['icd_code'].to_list()
icd_codes_string = ', '.join(["'" + icd + "'" for icd in icd_codes])
print(icd_codes_string)

# Subject ID that have an ICD code related to pneumonia, tuberculosis and bronchitis and a CHEST x-ray

df_pneumonia = run_query(f"""
SELECT `physionet-data.mimiciv_2_2_hosp.admissions`.hadm_id
FROM `physionet-data.mimiciv_2_2_hosp.patients`
JOIN `physionet-data.mimiciv_2_2_hosp.admissions` ON `physionet-data.mimiciv_2_2_hosp.patients`.subject_id = `physionet-data.mimiciv_2_2_hosp.admissions`.subject_id
JOIN `physionet-data.mimiciv_2_2_hosp.diagnoses_icd` ON `physionet-data.mimiciv_2_2_hosp.admissions`.hadm_id = `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`.hadm_id
WHERE `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`.icd_code IN ({icd_codes_string})
""")
print(df_pneumonia)




  return pd.io.gbq.read_gbq(


'00322', '01160', '01161', '01162', '01163', '01164', '01165', '01166', '0382', '0551', '0730', '11505', '11515', '11595', '4800', '4801', '4802', '4803', '4808', '4809', '481', '4820', '4821', '4822', '48230', '48231', '48232', '48239', '48240', '48241', '48242', '48249', '48281', '48282', '48283', '48284', '48289', '4829', '4830', '4831', '4838', '4841', '4843', '4845', '4846', '4847', '4848', '485', '486', '4870', '48801', '48811', '48881', '51630', '51635', '51636', '51637', '5171', '7700', '99731', '99732', 'V0382', 'V066', 'V1261', 'A0103', 'A0222', 'A3700', 'A3701', 'A3710', 'A3711', 'A3780', 'A3781', 'A3790', 'A3791', 'A403', 'A5004', 'A5484', 'B012', 'B052', 'B0681', 'B7781', 'B953', 'B960', 'B961', 'J09X1', 'J100', 'J1000', 'J1001', 'J1008', 'J110', 'J1100', 'J1108', 'J12', 'J120', 'J121', 'J122', 'J123', 'J128', 'J1281', 'J1289', 'J129', 'J13', 'J14', 'J15', 'J150', 'J151', 'J152', 'J1520', 'J1521', 'J15211', 'J15212', 'J1529', 'J153', 'J154', 'J155', 'J156', 'J157', 'J158',

In [None]:
# Add the 'pneumonia' feature to df
data['pneumonia'] = data['hadm_id'].isin(df_pneumonia['hadm_id']).astype(int)
print(data)

       subject_id   hadm_id                 time  diabete  age  hiv  oxygen  \
0        18415616  29138337  2164-04-28T00:00:00        0   50    0     NaN   
1        19136768  26451475  2151-04-08T00:00:00        1   77    0     NaN   
2        19136768  27903990  2149-07-22T00:00:00        1   75    0     NaN   
3        19136768  29901658  2150-12-24T00:00:00        1   76    0     NaN   
4        10554112  25105282  2154-08-26T00:00:00        1   39    0     NaN   
...           ...       ...                  ...      ...  ...  ...     ...   
29417    10940509  26533493  2201-01-01T00:00:00        0   80    1     NaN   
29418    19025237  20041142  2171-07-24T00:00:00        1   57    0     NaN   
29419    14733367  21448520  2148-03-26T00:00:00        1   78    0     NaN   
29420    16117323  24175963  2191-11-16T00:00:00        1   68    0     NaN   
29421    13312176  20763054  2136-07-28T00:00:00        0   32    1     NaN   

       heart_rate  temperature  hemoglobin   rbc   

In [None]:
# Find ICD code related to tuberculosis

df = run_query("""
SELECT icd_code, long_title FROM `physionet-data.mimiciv_2_2_hosp.d_icd_diagnoses`
WHERE LOWER(long_title) LIKE "%tuberculosis%"
""")

# ICD code list
icd_codes = df['icd_code'].to_list()
icd_codes_string = ', '.join(["'" + icd + "'" for icd in icd_codes])
print(icd_codes_string)

# Subject ID that have an ICD code related to pneumonia, tuberculosis and bronchitis and a CHEST x-ray

df_tuberculosis = run_query(f"""
SELECT `physionet-data.mimiciv_2_2_hosp.admissions`.hadm_id
FROM `physionet-data.mimiciv_2_2_hosp.patients`
JOIN `physionet-data.mimiciv_2_2_hosp.admissions` ON `physionet-data.mimiciv_2_2_hosp.patients`.subject_id = `physionet-data.mimiciv_2_2_hosp.admissions`.subject_id
JOIN `physionet-data.mimiciv_2_2_hosp.diagnoses_icd` ON `physionet-data.mimiciv_2_2_hosp.admissions`.hadm_id = `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`.hadm_id
WHERE `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`.icd_code IN ({icd_codes_string})
""")
print(df_tuberculosis)




  return pd.io.gbq.read_gbq(


'01005', '01006', '01010', '01011', '01012', '01013', '01014', '01015', '01016', '01080', '01081', '01082', '01083', '01084', '01085', '01086', '01095', '01096', '01100', '01101', '01102', '01103', '01104', '01105', '01106', '01110', '01111', '01112', '01113', '01114', '01115', '01116', '01120', '01121', '01122', '01123', '01124', '01125', '01126', '01130', '01131', '01132', '01133', '01134', '01135', '01136', '01145', '01146', '01155', '01156', '01165', '01166', '01175', '01176', '01180', '01181', '01182', '01183', '01184', '01185', '01186', '01190', '01191', '01192', '01193', '01194', '01195', '01196', '01205', '01206', '01210', '01211', '01212', '01213', '01214', '01215', '01216', '01220', '01221', '01222', '01223', '01224', '01225', '01226', '01235', '01236', '01280', '01281', '01282', '01283', '01284', '01285', '01286', '01305', '01306', '01315', '01316', '01325', '01326', '01335', '01336', '01345', '01346', '01355', '01356', '01365', '01366', '01380', '01381', '01382', '01383', '

In [None]:
# Add the 'tuberculosis' feature to df
data['tuberculosis'] = data['hadm_id'].isin(df_tuberculosis['hadm_id']).astype(int)
print(data)

       subject_id   hadm_id                 time  diabete  age  hiv  oxygen  \
0        18415616  29138337  2164-04-28T00:00:00        0   50    0     NaN   
1        19136768  26451475  2151-04-08T00:00:00        1   77    0     NaN   
2        19136768  27903990  2149-07-22T00:00:00        1   75    0     NaN   
3        19136768  29901658  2150-12-24T00:00:00        1   76    0     NaN   
4        10554112  25105282  2154-08-26T00:00:00        1   39    0     NaN   
...           ...       ...                  ...      ...  ...  ...     ...   
29417    10940509  26533493  2201-01-01T00:00:00        0   80    1     NaN   
29418    19025237  20041142  2171-07-24T00:00:00        1   57    0     NaN   
29419    14733367  21448520  2148-03-26T00:00:00        1   78    0     NaN   
29420    16117323  24175963  2191-11-16T00:00:00        1   68    0     NaN   
29421    13312176  20763054  2136-07-28T00:00:00        0   32    1     NaN   

       heart_rate  temperature  hemoglobin   rbc   

In [None]:
# Find ICD code related to bronchitis

df = run_query("""
SELECT icd_code, long_title FROM `physionet-data.mimiciv_2_2_hosp.d_icd_diagnoses`
WHERE LOWER(long_title) LIKE "%bronchitis%"
""")

# ICD code list
icd_codes = df['icd_code'].to_list()
icd_codes_string = ', '.join(["'" + icd + "'" for icd in icd_codes])
print(icd_codes_string)

# Subject ID that have an ICD code related to pneumonia, tuberculosis and bronchitis and a CHEST x-ray

df_bronchitis = run_query(f"""
SELECT `physionet-data.mimiciv_2_2_hosp.admissions`.hadm_id
FROM `physionet-data.mimiciv_2_2_hosp.patients`
JOIN `physionet-data.mimiciv_2_2_hosp.admissions` ON `physionet-data.mimiciv_2_2_hosp.patients`.subject_id = `physionet-data.mimiciv_2_2_hosp.admissions`.subject_id
JOIN `physionet-data.mimiciv_2_2_hosp.diagnoses_icd` ON `physionet-data.mimiciv_2_2_hosp.admissions`.hadm_id = `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`.hadm_id
WHERE `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`.icd_code IN ({icd_codes_string})
""")
print(df_bronchitis)




  return pd.io.gbq.read_gbq(


'4660', '490', '4910', '4911', '49120', '49121', '49122', '4918', '4919', '5060', 'V813', 'J20', 'J200', 'J201', 'J202', 'J203', 'J204', 'J205', 'J206', 'J207', 'J208', 'J209', 'J40', 'J41', 'J410', 'J411', 'J418', 'J42', 'J680'
       hadm_id
0     26192060
1     25319407
2     29674731
3     20894865
4     26904842
...        ...
4772  20664028
4773  29244506
4774  26187239
4775  27534907
4776  23779527

[4777 rows x 1 columns]


In [None]:
# Add the 'bronchitis' feature to df
data['bronchitis'] = data['hadm_id'].isin(df_bronchitis['hadm_id']).astype(int)
print(data)

       subject_id   hadm_id                 time  diabete  age  hiv  oxygen  \
0        18415616  29138337  2164-04-28T00:00:00        0   50    0     NaN   
1        19136768  26451475  2151-04-08T00:00:00        1   77    0     NaN   
2        19136768  27903990  2149-07-22T00:00:00        1   75    0     NaN   
3        19136768  29901658  2150-12-24T00:00:00        1   76    0     NaN   
4        10554112  25105282  2154-08-26T00:00:00        1   39    0     NaN   
...           ...       ...                  ...      ...  ...  ...     ...   
29417    10940509  26533493  2201-01-01T00:00:00        0   80    1     NaN   
29418    19025237  20041142  2171-07-24T00:00:00        1   57    0     NaN   
29419    14733367  21448520  2148-03-26T00:00:00        1   78    0     NaN   
29420    16117323  24175963  2191-11-16T00:00:00        1   68    0     NaN   
29421    13312176  20763054  2136-07-28T00:00:00        0   32    1     NaN   

       heart_rate  temperature  hemoglobin   rbc   

In [None]:
# Find ICD code related to arthritis

df = run_query("""
SELECT icd_code, long_title FROM `physionet-data.mimiciv_2_2_hosp.d_icd_diagnoses`
WHERE LOWER(long_title) LIKE "%arthritis%"
""")

# ICD code list
icd_codes = df['icd_code'].to_list()
icd_codes_string = ', '.join(["'" + icd + "'" for icd in icd_codes])
print(icd_codes_string)

# Subject ID that have an ICD code related to pneumonia, tuberculosis and bronchitis and a CHEST x-ray

df_arthritis = run_query(f"""
SELECT `physionet-data.mimiciv_2_2_hosp.admissions`.hadm_id
FROM `physionet-data.mimiciv_2_2_hosp.patients`
JOIN `physionet-data.mimiciv_2_2_hosp.admissions` ON `physionet-data.mimiciv_2_2_hosp.patients`.subject_id = `physionet-data.mimiciv_2_2_hosp.admissions`.subject_id
JOIN `physionet-data.mimiciv_2_2_hosp.diagnoses_icd` ON `physionet-data.mimiciv_2_2_hosp.admissions`.hadm_id = `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`.hadm_id
WHERE `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`.icd_code IN ({icd_codes_string})
""")
print(df_arthritis)




  return pd.io.gbq.read_gbq(


'00323', '05671', '09850', '71100', '71101', '71102', '71103', '71104', '71105', '71106', '71107', '71108', '71109', '71190', '71191', '71192', '71193', '71194', '71195', '71196', '71197', '71198', '71199', '7140', '7142', '71430', '71431', '71432', '71433', '71620', '71621', '71622', '71623', '71624', '71625', '71626', '71627', '71628', '71629', '71630', '71631', '71632', '71633', '71634', '71635', '71636', '71637', '71638', '71639', '71650', '71651', '71652', '71653', '71654', '71655', '71656', '71657', '71658', '71659', '71660', '71661', '71662', '71663', '71664', '71665', '71666', '71667', '71668', 'V134', 'V177', 'V821', 'A0104', 'A0223', 'A1802', 'A3983', 'A3984', 'A5442', 'A6923', 'B0682', 'B2685', 'B4282', 'E7881', 'E790', 'G980', 'L4052', 'M00', 'M000', 'M0000', 'M0001', 'M00011', 'M00012', 'M00019', 'M0002', 'M00021', 'M00022', 'M00029', 'M0003', 'M00031', 'M00032', 'M00039', 'M0004', 'M00041', 'M00042', 'M00049', 'M0005', 'M00051', 'M00052', 'M00059', 'M0006', 'M00061', 'M00

In [None]:
# Add the arthritis feature to df
data['arthritis'] = data['hadm_id'].isin(df_arthritis['hadm_id']).astype(int)
print(data)

       subject_id   hadm_id                 time  diabete  age  hiv  oxygen  \
0        18415616  29138337  2164-04-28T00:00:00        0   50    0     NaN   
1        19136768  26451475  2151-04-08T00:00:00        1   77    0     NaN   
2        19136768  27903990  2149-07-22T00:00:00        1   75    0     NaN   
3        19136768  29901658  2150-12-24T00:00:00        1   76    0     NaN   
4        10554112  25105282  2154-08-26T00:00:00        1   39    0     NaN   
...           ...       ...                  ...      ...  ...  ...     ...   
29417    10940509  26533493  2201-01-01T00:00:00        0   80    1     NaN   
29418    19025237  20041142  2171-07-24T00:00:00        1   57    0     NaN   
29419    14733367  21448520  2148-03-26T00:00:00        1   78    0     NaN   
29420    16117323  24175963  2191-11-16T00:00:00        1   68    0     NaN   
29421    13312176  20763054  2136-07-28T00:00:00        0   32    1     NaN   

       heart_rate  temperature  hemoglobin   rbc   

In [None]:
# Find ICD code related to fracture

df = run_query("""
SELECT icd_code, long_title FROM `physionet-data.mimiciv_2_2_hosp.d_icd_diagnoses`
WHERE LOWER(long_title) LIKE "%fracture%"
""")

# ICD code list
icd_codes = df['icd_code'].to_list()
icd_codes_string = ', '.join(["'" + icd + "'" for icd in icd_codes])
print(icd_codes_string)

# Subject ID that have an ICD code related to pneumonia, tuberculosis and bronchitis and a CHEST x-ray

df_fracture = run_query(f"""
SELECT `physionet-data.mimiciv_2_2_hosp.admissions`.hadm_id
FROM `physionet-data.mimiciv_2_2_hosp.patients`
JOIN `physionet-data.mimiciv_2_2_hosp.admissions` ON `physionet-data.mimiciv_2_2_hosp.patients`.subject_id = `physionet-data.mimiciv_2_2_hosp.admissions`.subject_id
JOIN `physionet-data.mimiciv_2_2_hosp.diagnoses_icd` ON `physionet-data.mimiciv_2_2_hosp.admissions`.hadm_id = `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`.hadm_id
WHERE `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`.icd_code IN ({icd_codes_string})
""")
print(df_fracture)




  return pd.io.gbq.read_gbq(


'52563', '52564', '73310', '73311', '73312', '73313', '73314', '73315', '73316', '73319', '73381', '73382', '73393', '73394', '73395', '73396', '73397', '73398', '7672', '80000', '80001', '80002', '80003', '80004', '80005', '80006', '80009', '80010', '80011', '80012', '80013', '80014', '80015', '80016', '80019', '80020', '80021', '80022', '80023', '80024', '80025', '80026', '80029', '80030', '80031', '80032', '80033', '80034', '80035', '80036', '80039', '80040', '80041', '80042', '80043', '80044', '80045', '80046', '80049', '80050', '80051', '80052', '80053', '80054', '80055', '80056', '80059', '80060', '80061', '80062', '80063', '80064', '80065', '80066', '80069', '80070', '80071', '80072', '80073', '80074', '80075', '80076', '80079', '80080', '80081', '80082', '80083', '80084', '80085', '80086', '80089', '80090', '80091', '80092', '80093', '80094', '80095', '80096', '80099', '80100', '80101', '80102', '80103', '80104', '80105', '80106', '80109', '80110', '80111', '80112', '80113', '8

In [None]:
# Add the fracture feature to df
data['fracture'] = data['hadm_id'].isin(df_fracture['hadm_id']).astype(int)
print(data)

       subject_id   hadm_id                 time  diabete  age  hiv  oxygen  \
0        18415616  29138337  2164-04-28T00:00:00        0   50    0     NaN   
1        19136768  26451475  2151-04-08T00:00:00        1   77    0     NaN   
2        19136768  27903990  2149-07-22T00:00:00        1   75    0     NaN   
3        19136768  29901658  2150-12-24T00:00:00        1   76    0     NaN   
4        10554112  25105282  2154-08-26T00:00:00        1   39    0     NaN   
...           ...       ...                  ...      ...  ...  ...     ...   
29417    10940509  26533493  2201-01-01T00:00:00        0   80    1     NaN   
29418    19025237  20041142  2171-07-24T00:00:00        1   57    0     NaN   
29419    14733367  21448520  2148-03-26T00:00:00        1   78    0     NaN   
29420    16117323  24175963  2191-11-16T00:00:00        1   68    0     NaN   
29421    13312176  20763054  2136-07-28T00:00:00        0   32    1     NaN   

       heart_rate  temperature  hemoglobin   rbc   

In [None]:
# Find ICD code related to cancer

df = run_query("""
SELECT icd_code, long_title FROM `physionet-data.mimiciv_2_2_hosp.d_icd_diagnoses`
WHERE LOWER(long_title) LIKE "%malignant%neoplasm%"
""")

# ICD code list
icd_codes = df['icd_code'].to_list()
icd_codes_string = ', '.join(["'" + icd + "'" for icd in icd_codes])
print(icd_codes_string)

# Subject ID that have an ICD code related to pneumonia, tuberculosis and bronchitis and a CHEST x-ray

df_lung_cancer = run_query(f"""
SELECT `physionet-data.mimiciv_2_2_hosp.admissions`.hadm_id
FROM `physionet-data.mimiciv_2_2_hosp.patients`
JOIN `physionet-data.mimiciv_2_2_hosp.admissions` ON `physionet-data.mimiciv_2_2_hosp.patients`.subject_id = `physionet-data.mimiciv_2_2_hosp.admissions`.subject_id
JOIN `physionet-data.mimiciv_2_2_hosp.diagnoses_icd` ON `physionet-data.mimiciv_2_2_hosp.admissions`.hadm_id = `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`.hadm_id
WHERE `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`.icd_code IN ({icd_codes_string})
""")
print(df_lung_cancer)




  return pd.io.gbq.read_gbq(


'1400', '1401', '1403', '1404', '1405', '1406', '1408', '1409', '1410', '1411', '1412', '1413', '1414', '1415', '1416', '1418', '1419', '1420', '1421', '1422', '1428', '1429', '1430', '1431', '1438', '1439', '1440', '1441', '1448', '1449', '1450', '1451', '1452', '1453', '1454', '1455', '1456', '1458', '1459', '1460', '1461', '1462', '1463', '1464', '1465', '1466', '1467', '1468', '1469', '1470', '1471', '1472', '1473', '1478', '1479', '1480', '1481', '1482', '1483', '1488', '1489', '1490', '1491', '1498', '1499', '1500', '1501', '1502', '1503', '1504', '1505', '1508', '1509', '1510', '1511', '1512', '1513', '1514', '1515', '1516', '1518', '1519', '1520', '1521', '1522', '1523', '1528', '1529', '1530', '1531', '1532', '1533', '1534', '1535', '1536', '1537', '1538', '1539', '1540', '1541', '1542', '1543', '1548', '1550', '1551', '1552', '1560', '1561', '1562', '1568', '1569', '1570', '1571', '1572', '1573', '1574', '1578', '1579', '1580', '1588', '1589', '1590', '1591', '1598', '1599', 

In [None]:
# Add the cancer feature to df
data['lung_cancer'] = data['hadm_id'].isin(df_lung_cancer['hadm_id']).astype(int)
print(data)

       subject_id   hadm_id                 time  diabete  age  hiv  oxygen  \
0        18415616  29138337  2164-04-28T00:00:00        0   50    0     NaN   
1        19136768  26451475  2151-04-08T00:00:00        1   77    0     NaN   
2        19136768  27903990  2149-07-22T00:00:00        1   75    0     NaN   
3        19136768  29901658  2150-12-24T00:00:00        1   76    0     NaN   
4        10554112  25105282  2154-08-26T00:00:00        1   39    0     NaN   
...           ...       ...                  ...      ...  ...  ...     ...   
29417    10940509  26533493  2201-01-01T00:00:00        0   80    1     NaN   
29418    19025237  20041142  2171-07-24T00:00:00        1   57    0     NaN   
29419    14733367  21448520  2148-03-26T00:00:00        1   78    0     NaN   
29420    16117323  24175963  2191-11-16T00:00:00        1   68    0     NaN   
29421    13312176  20763054  2136-07-28T00:00:00        0   32    1     NaN   

       heart_rate  temperature  hemoglobin  ...   w

In [None]:
# Find ICD code related to scoliosis

df = run_query("""
SELECT icd_code, long_title FROM `physionet-data.mimiciv_2_2_hosp.d_icd_diagnoses`
WHERE LOWER(long_title) LIKE "%scoliosis%"
""")

# ICD code list
icd_codes = df['icd_code'].to_list()
icd_codes_string = ', '.join(["'" + icd + "'" for icd in icd_codes])
print(icd_codes_string)

# Subject ID that have an ICD code related to pneumonia, tuberculosis and bronchitis and a CHEST x-ray

df_scoliosis = run_query(f"""
SELECT `physionet-data.mimiciv_2_2_hosp.admissions`.hadm_id
FROM `physionet-data.mimiciv_2_2_hosp.patients`
JOIN `physionet-data.mimiciv_2_2_hosp.admissions` ON `physionet-data.mimiciv_2_2_hosp.patients`.subject_id = `physionet-data.mimiciv_2_2_hosp.admissions`.subject_id
JOIN `physionet-data.mimiciv_2_2_hosp.diagnoses_icd` ON `physionet-data.mimiciv_2_2_hosp.admissions`.hadm_id = `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`.hadm_id
WHERE `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`.icd_code IN ({icd_codes_string})
""")
print(df_scoliosis)




  return pd.io.gbq.read_gbq(


'73730', '73731', '73732', '73733', '73734', '73739', '73743', 'M41', 'M410', 'M4100', 'M4102', 'M4103', 'M4104', 'M4105', 'M4106', 'M4107', 'M4108', 'M411', 'M4111', 'M41112', 'M41113', 'M41114', 'M41115', 'M41116', 'M41117', 'M41119', 'M4112', 'M41122', 'M41123', 'M41124', 'M41125', 'M41126', 'M41127', 'M41129', 'M412', 'M4120', 'M4122', 'M4123', 'M4124', 'M4125', 'M4126', 'M4127', 'M413', 'M4130', 'M4134', 'M4135', 'M414', 'M4140', 'M4141', 'M4142', 'M4143', 'M4144', 'M4145', 'M4146', 'M4147', 'M415', 'M4150', 'M4152', 'M4153', 'M4154', 'M4155', 'M4156', 'M4157', 'M418', 'M4180', 'M4182', 'M4183', 'M4184', 'M4185', 'M4186', 'M4187', 'M419', 'M965', 'Q763', 'Q764', 'Q7649', 'Z4782'
       hadm_id
0     22416024
1     25140535
2     21171327
3     20369629
4     29095617
...        ...
1711  20414159
1712  24479213
1713  20124156
1714  24974319
1715  27675917

[1716 rows x 1 columns]


In [None]:
# Add the scoliosis feature to df
data['scoliosis'] = data['hadm_id'].isin(df_scoliosis['hadm_id']).astype(int)
print(data)

       subject_id   hadm_id                 time  diabete  age  hiv  oxygen  \
0        18415616  29138337  2164-04-28T00:00:00        0   50    0     NaN   
1        19136768  26451475  2151-04-08T00:00:00        1   77    0     NaN   
2        19136768  27903990  2149-07-22T00:00:00        1   75    0     NaN   
3        19136768  29901658  2150-12-24T00:00:00        1   76    0     NaN   
4        10554112  25105282  2154-08-26T00:00:00        1   39    0     NaN   
...           ...       ...                  ...      ...  ...  ...     ...   
29417    10940509  26533493  2201-01-01T00:00:00        0   80    1     NaN   
29418    19025237  20041142  2171-07-24T00:00:00        1   57    0     NaN   
29419    14733367  21448520  2148-03-26T00:00:00        1   78    0     NaN   
29420    16117323  24175963  2191-11-16T00:00:00        1   68    0     NaN   
29421    13312176  20763054  2136-07-28T00:00:00        0   32    1     NaN   

       heart_rate  temperature  hemoglobin  ...  sp

In [None]:
# Find ICD code related to infection

df = run_query("""
SELECT icd_code, long_title FROM `physionet-data.mimiciv_2_2_hosp.d_icd_diagnoses`
WHERE LOWER(long_title) LIKE "%infection"
""")

# ICD code list
icd_codes = df['icd_code'].to_list()
icd_codes_string = ', '.join(["'" + icd + "'" for icd in icd_codes])
print(icd_codes_string)

# Subject ID that have an ICD code related to pneumonia, tuberculosis and bronchitis and a CHEST x-ray

df_lung_infection = run_query(f"""
SELECT `physionet-data.mimiciv_2_2_hosp.admissions`.hadm_id
FROM `physionet-data.mimiciv_2_2_hosp.patients`
JOIN `physionet-data.mimiciv_2_2_hosp.admissions` ON `physionet-data.mimiciv_2_2_hosp.patients`.subject_id = `physionet-data.mimiciv_2_2_hosp.admissions`.subject_id
JOIN `physionet-data.mimiciv_2_2_hosp.diagnoses_icd` ON `physionet-data.mimiciv_2_2_hosp.admissions`.hadm_id = `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`.hadm_id
WHERE `physionet-data.mimiciv_2_2_hosp.diagnoses_icd`.icd_code IN ({icd_codes_string})
""")
print(df_lung_infection)




  return pd.io.gbq.read_gbq(


'0271', '0390', '0391', '0392', '0393', '05881', '05882', '05889', '07981', '07988', '07989', '07998', '07999', '1232', '1238', '1255', '28804', '326', '59800', '6380', '7711', '7717', '9100', '9102', '9104', '9106', '9108', '9110', '9112', '9114', '9116', '9118', '9120', '9122', '9124', '9126', '9128', '9130', '9132', '9134', '9136', '9138', '9140', '9142', '9144', '9146', '9148', '9150', '9152', '9154', '9156', '9158', '9160', '9162', '9164', '9166', '9168', '9170', '9172', '9174', '9176', '9178', '9190', '9192', '9194', '9196', '9198', '99859', 'V1302', 'A0229', 'A040', 'A041', 'A042', 'A043', 'A310', 'A311', 'A39', 'A54', 'A542', 'A5439', 'A5483', 'A5619', 'A923', 'B0089', 'B108', 'B1081', 'B1082', 'B1089', 'D703', 'H6132', 'H61321', 'H61322', 'J22', 'J440', 'J470', 'K8500', 'K8510', 'K8520', 'K8530', 'K8580', 'K9402', 'K9412', 'K9422', 'K9432', 'N46022', 'N46122', 'N99511', 'P351', 'P352', 'P393', 'P394', 'P582', 'R65', 'T8603', 'T8613', 'T8623', 'T8633', 'T8643', 'T86812', 'T8682

In [None]:
# Add the infection feature to df
data['lung_infection'] = data['hadm_id'].isin(df_lung_infection['hadm_id']).astype(int)
print(data)

       subject_id   hadm_id                 time  diabete  age  hiv  oxygen  \
0        18415616  29138337  2164-04-28T00:00:00        0   50    0     NaN   
1        19136768  26451475  2151-04-08T00:00:00        1   77    0     NaN   
2        19136768  27903990  2149-07-22T00:00:00        1   75    0     NaN   
3        19136768  29901658  2150-12-24T00:00:00        1   76    0     NaN   
4        10554112  25105282  2154-08-26T00:00:00        1   39    0     NaN   
...           ...       ...                  ...      ...  ...  ...     ...   
29417    10940509  26533493  2201-01-01T00:00:00        0   80    1     NaN   
29418    19025237  20041142  2171-07-24T00:00:00        1   57    0     NaN   
29419    14733367  21448520  2148-03-26T00:00:00        1   78    0     NaN   
29420    16117323  24175963  2191-11-16T00:00:00        1   68    0     NaN   
29421    13312176  20763054  2136-07-28T00:00:00        0   32    1     NaN   

       heart_rate  temperature  hemoglobin  ...  sm

In [None]:
# Drop duplicate 'hadm_id' rows, keeping only the first occurrence
data = data.drop_duplicates(subset='hadm_id', keep='first')

print(data)

       subject_id   hadm_id                 time  diabete  age  hiv  oxygen  \
0        18415616  29138337  2164-04-28T00:00:00        0   50    0     NaN   
1        19136768  26451475  2151-04-08T00:00:00        1   77    0     NaN   
2        19136768  27903990  2149-07-22T00:00:00        1   75    0     NaN   
3        19136768  29901658  2150-12-24T00:00:00        1   76    0     NaN   
4        10554112  25105282  2154-08-26T00:00:00        1   39    0     NaN   
...           ...       ...                  ...      ...  ...  ...     ...   
29417    10940509  26533493  2201-01-01T00:00:00        0   80    1     NaN   
29418    19025237  20041142  2171-07-24T00:00:00        1   57    0     NaN   
29419    14733367  21448520  2148-03-26T00:00:00        1   78    0     NaN   
29420    16117323  24175963  2191-11-16T00:00:00        1   68    0     NaN   
29421    13312176  20763054  2136-07-28T00:00:00        0   32    1     NaN   

       heart_rate  temperature  hemoglobin  ...  sm

#### **6/ Analysis of the data**

In [None]:
data.isna().sum()

Unnamed: 0,0
subject_id,0
hadm_id,0
time,0
diabete,0
age,0
hiv,0
oxygen,26234
heart_rate,20749
temperature,20801
hemoglobin,2543


In [None]:
data.describe()

Unnamed: 0,subject_id,hadm_id,diabete,age,hiv,oxygen,heart_rate,temperature,hemoglobin,rbc,...,smoke,cough,pneumonia,tuberculosis,bronchitis,arthritis,fracture,lung_cancer,scoliosis,lung_infection
count,29082.0,29082.0,29082.0,29082.0,29082.0,2848.0,8333.0,8281.0,26539.0,26539.0,...,29082.0,29082.0,29082.0,29082.0,29082.0,29082.0,29082.0,29082.0,29082.0,29082.0
mean,15016000.0,25038070.0,0.483014,66.698748,0.037893,49.263904,84.231249,98.328379,10.164339,3.446914,...,0.120177,0.003851,0.308404,0.024311,0.057389,0.041091,0.072657,0.268379,0.004505,0.03779
std,2866767.0,2879976.0,0.49972,15.929504,0.19094,22.475153,26.295117,14.544131,2.155202,0.751732,...,0.325174,0.061939,0.461842,0.154014,0.232589,0.198503,0.259576,0.443124,0.066965,0.190691
min,10001180.0,20000060.0,0.0,18.0,0.0,0.0,0.0,0.0,2.3,0.78,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,12484310.0,22551000.0,0.0,56.0,0.0,40.0,65.0,97.5,8.5,2.88,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,15005520.0,25064720.0,0.0,67.0,0.0,40.0,77.0,97.9,10.0,3.42,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,17477300.0,27521140.0,1.0,79.0,0.0,50.0,99.0,98.5,11.7,3.96,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
max,19999290.0,29999600.0,1.0,100.0,1.0,100.0,197.0,999.0,19.1,7.46,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
non_zero_counts = data.apply(lambda x: (x != 0).sum())
print(non_zero_counts)

subject_id        29082
hadm_id           29082
time              29082
diabete           14047
age               29082
hiv                1102
oxygen            29080
heart_rate        29067
temperature       29074
hemoglobin        29082
rbc               29082
wbc               29082
sputum_culture    29082
smoke              3495
cough               112
pneumonia          8969
tuberculosis        707
bronchitis         1669
arthritis          1195
fracture           2113
lung_cancer        7805
scoliosis           131
lung_infection     1099
dtype: int64


In [None]:
data['sum'] = data['pneumonia'] + data['tuberculosis'] + data['bronchitis'] + data['arthritis'] + data['fracture'] + data['lung_cancer'] + data['scoliosis'] + data['lung_infection']


In [None]:
print(data)

       subject_id   hadm_id                 time  diabete  age  hiv  oxygen  \
0        18415616  29138337  2164-04-28T00:00:00        0   50    0     NaN   
1        19136768  26451475  2151-04-08T00:00:00        1   77    0     NaN   
2        19136768  27903990  2149-07-22T00:00:00        1   75    0     NaN   
3        19136768  29901658  2150-12-24T00:00:00        1   76    0     NaN   
4        10554112  25105282  2154-08-26T00:00:00        1   39    0     NaN   
...           ...       ...                  ...      ...  ...  ...     ...   
29417    10940509  26533493  2201-01-01T00:00:00        0   80    1     NaN   
29418    19025237  20041142  2171-07-24T00:00:00        1   57    0     NaN   
29419    14733367  21448520  2148-03-26T00:00:00        1   78    0     NaN   
29420    16117323  24175963  2191-11-16T00:00:00        1   68    0     NaN   
29421    13312176  20763054  2136-07-28T00:00:00        0   32    1     NaN   

       heart_rate  temperature  hemoglobin  ...  co

In [None]:
data['sum'].value_counts()

Unnamed: 0_level_0,count
sum,Unnamed: 1_level_1
1,11892
0,11741
2,4624
3,755
4,67
5,3


Some patients have multiple diseases at the same time

In [None]:
data.to_csv('other_feature_data.csv', index=False)

#### **7/ Merge this additional features with the x-ray data**

In [None]:
columns_to_drop = ['time', 'hadm_id']

# Drop the specified columns
data = data.drop(columns=columns_to_drop)
data

Unnamed: 0,subject_id,diabete,age,hiv,oxygen,heart_rate,temperature,hemoglobin,rbc,wbc,...,cough,pneumonia,tuberculosis,bronchitis,arthritis,fracture,lung_cancer,scoliosis,lung_infection,sum
0,18415616,0,50,0,,,,11.9,4.40,5.5,...,0,0,0,0,0,0,0,0,0,0
1,19136768,1,77,0,,,,12.1,3.96,7.0,...,0,0,0,0,0,1,0,0,0,1
2,19136768,1,75,0,,,,,,,...,0,0,0,0,0,0,0,0,0,0
3,19136768,1,76,0,,,,14.4,4.69,10.9,...,0,0,0,1,0,0,0,0,1,2
4,10554112,1,39,0,,86.0,99.3,8.5,2.73,26.5,...,0,0,0,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29417,10940509,0,80,1,,,,12.9,3.94,4.2,...,0,0,1,0,0,1,0,0,0,2
29418,19025237,1,57,0,,77.0,98.6,14.6,4.35,4.9,...,0,1,0,0,0,0,0,0,0,1
29419,14733367,1,78,0,,,,9.6,4.47,7.5,...,0,1,0,0,0,0,0,0,0,1
29420,16117323,1,68,0,,,,11.8,3.51,7.9,...,0,1,0,0,0,0,0,0,0,1


In [None]:
# Merge the two DataFrames based on 'subject_id'
data_f = data.merge(data_x_ray, on='subject_id', how='left')

# Drop duplicate 'subject_id' rows, keeping only the last age occurrence
data_f = data_f.drop_duplicates(subset='hadm_id', keep='last')

print(data_f)

        subject_id  diabete  age  hiv  oxygen  heart_rate  temperature  \
0         18415616        0   50    0     NaN         NaN          NaN   
7         19136768        1   76    0     NaN         NaN          NaN   
8         19136768        1   76    0     NaN         NaN          NaN   
9         19136768        1   76    0     NaN         NaN          NaN   
28        17835008        0   56    1     NaN         NaN          NaN   
...            ...      ...  ...  ...     ...         ...          ...   
169253    13312176        0   32    1     NaN         NaN          NaN   
169254    13312176        0   32    1     NaN         NaN          NaN   
169255    13312176        0   32    1     NaN         NaN          NaN   
169256    13312176        0   32    1     NaN         NaN          NaN   
169257    13312176        0   32    1     NaN         NaN          NaN   

        hemoglobin   rbc   wbc  ...        14        15        16        17  \
0             11.9  4.40   5.5  

In [None]:
data_f.to_csv('x_ray_and_features_data.csv', index=False)

NameError: name 'data_f' is not defined

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!cp /content/x_ray_and_features_data.csv /content/drive/MyDrive/doan_demo/


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
