<a href="https://colab.research.google.com/github/tronglinux123/AI-Project/blob/main/deep_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install google-cloud-bigquery



In [None]:
from datetime import timedelta
import os

import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from IPython.display import display, HTML, Image
%matplotlib inline

plt.style.use('ggplot')
plt.rcParams.update({'font.size': 20})

# Access data using Google BigQuery.
from google.colab import auth
from google.cloud import bigquery

In [None]:
# Colab sign-in step; presumably required before the BigQuery queries below can run.
auth.authenticate_user()

In [None]:
# Set up environment variables
project_id = 'project-physionet-474410'
os.environ["GOOGLE_CLOUD_PROJECT"] = project_id

# Read data from BigQuery into pandas dataframes.
def run_query(query, project_id=project_id):
  """Run a SQL query on BigQuery and return the result as a DataFrame.

  Goes through the `google.cloud.bigquery` client (imported at the top of the
  notebook) rather than `pd.io.gbq.read_gbq`, which pandas has deprecated
  since version 2.2.

  Args:
    query: SQL text in the standard (GoogleSQL) dialect, which is the
      client's default.
    project_id: Google Cloud project the query job is run under; defaults to
      the notebook-level `project_id`.

  Returns:
    pandas.DataFrame holding the query result.
  """
  client = bigquery.Client(project=project_id)
  return client.query(query).to_dataframe()

# set the dataset
dataset = 'mimiciv'


In [None]:
# Load the x-ray table and keep only the identifier / timestamp columns.
data_x_ray = pd.read_csv('x_ray_data.csv')
id_columns = ['subject_id', 'hadm_id', 'time']
data = data_x_ray[id_columns]
# Remove the row carrying the last index label of the selection.
last_label = data.index[-1]
data = data.drop(index=last_label)
data

FileNotFoundError: [Errno 2] No such file or directory: 'x_ray_data.csv'

### **Diabetes**

In [None]:
# Look up every ICD code whose long title contains "diabete"
df = run_query("""
SELECT icd_code, long_title FROM `physionet-data.mimiciv_hosp.d_icd_diagnoses`
WHERE LOWER(long_title) LIKE "%diabete%"
""")
print(df)

# Quote each code and join them so the result can be placed inside a SQL IN (...) list
icd_codes_diabete = df['icd_code'].tolist()
quoted_codes = ["'" + code + "'" for code in icd_codes_diabete]
icd_codes_string_diabete = ', '.join(quoted_codes)
print(icd_codes_string_diabete)


In [None]:
# Fetch (subject_id, hadm_id) for every diagnosis row whose ICD code is in the
# diabetes code list (icd_codes_string_diabete) built in the previous cell.
# NOTE(review): the filter matches on icd_code only; if a code string can exist
# under more than one ICD version, also matching icd_version may be needed — confirm.
df_diabete = run_query(f"""
SELECT `physionet-data.mimiciv_hosp.diagnoses_icd`.subject_id, `physionet-data.mimiciv_hosp.diagnoses_icd`.hadm_id
FROM `physionet-data.mimiciv_hosp.diagnoses_icd`
WHERE `physionet-data.mimiciv_hosp.diagnoses_icd`.icd_code IN ({icd_codes_string_diabete})
""")
print(df_diabete)

Next, create a binary variable: 1 if the patient is in this list, 0 otherwise.

In [None]:
# Flag each row whose subject_id appears in df_diabete (1 = present, 0 = absent)
has_diabetes = data['subject_id'].isin(df_diabete['subject_id'])
data['diabete'] = has_diabetes.astype(int)
print(data)

### **Age**

In [None]:
# Pull the (age, hadm_id) pairs from the derived age table; the result is
# merged into `data` on hadm_id in the next cell.
df_age = run_query("""
SELECT a.age, a.hadm_id
FROM `physionet-data.mimiciv_derived.age` a
""")
print(df_age)


In [None]:
# Attach age by admission id (left join on 'hadm_id'), then keep a single row
# per hadm_id — the last one when several rows share the same id.
data = (
    data
    .merge(df_age, how='left', on='hadm_id')
    .drop_duplicates(subset=['hadm_id'], keep='last')
)

print(data)

### **HIV**

In [None]:
# Look up every ICD code whose long title contains "hiv"
# NOTE(review): this is a plain substring match, so unrelated titles that
# merely contain the letters "hiv" would also be selected — check the printout.
df = run_query("""
SELECT icd_code, long_title FROM `physionet-data.mimiciv_hosp.d_icd_diagnoses`
WHERE LOWER(long_title) LIKE "%hiv%"
""")
print(df)

# Quote each code and join them so the result can be placed inside a SQL IN (...) list
icd_codes_hiv = df['icd_code'].tolist()
quoted_codes = ["'" + code + "'" for code in icd_codes_hiv]
icd_codes_string_hiv = ', '.join(quoted_codes)
print(icd_codes_string_hiv)


In [None]:
# Fetch the subject_id of every diagnosis row whose ICD code is in the HIV
# code list (icd_codes_string_hiv) built in the previous cell.
df_hiv = run_query(f"""
SELECT `physionet-data.mimiciv_hosp.diagnoses_icd`.subject_id
FROM `physionet-data.mimiciv_hosp.diagnoses_icd`
WHERE `physionet-data.mimiciv_hosp.diagnoses_icd`.icd_code IN ({icd_codes_string_hiv})
""")
print(df_hiv)

Next, create a binary variable: 1 if the patient is in this list, 0 otherwise.

In [None]:
# Flag each row whose subject_id appears in df_hiv (1 = present, 0 = absent)
has_hiv = data['subject_id'].isin(df_hiv['subject_id'])
data['hiv'] = has_hiv.astype(int)
print(data)

### **Oxygen**

In [None]:
# List the lab item definitions whose label contains "oxygen"
# (presumably used to choose the itemid queried in the next cell).
df = run_query("""
SELECT *
FROM `physionet-data.mimiciv_hosp.d_labitems`
WHERE LOWER(LABEL) LIKE '%oxygen%'
""")
print(df)

In [None]:
# Fetch (hadm_id, valuenum) for every lab event recorded under itemid 50816.
# NOTE(review): 50816 is a hard-coded item id, presumably taken from the
# d_labitems lookup above — confirm it is the intended oxygen measurement.
df_oxygen = run_query("""
SELECT hadm_id, valuenum
FROM `physionet-data.mimiciv_hosp.labevents`
WHERE itemid = 50816
""")
print(df_oxygen)

In [None]:
# Discard lab rows in which hadm_id or valuenum is missing
df_oxygen = df_oxygen.dropna(axis=0, how='any')

In [None]:
# Reduce the lab frame to one row per admission *before* merging. The final
# result matches merging everything and de-duplicating afterwards (the first
# match per hadm_id wins either way), but the intermediate frame no longer
# grows by one row per lab measurement.
# NOTE(review): "first" follows the row order of df_oxygen, and the query that
# produced it has no ORDER BY — confirm this is the measurement you want.
oxygen_first = df_oxygen.drop_duplicates(subset='hadm_id', keep='first')
data = data.merge(oxygen_first, on='hadm_id', how='left')
# Still enforce one row per hadm_id in case `data` itself held duplicates.
data = data.drop_duplicates(subset='hadm_id', keep='first')
data = data.rename(columns={'valuenum': 'oxygen'})

print(data)

### **Heart rate**

In [None]:
# List the ICU item definitions whose label matches "heart ... rate"
# (presumably used to choose the itemid queried in the next cell).
df = run_query("""
SELECT itemid, label
FROM `physionet-data.mimiciv_icu.d_items`
WHERE LOWER(LABEL) LIKE '%heart%rate%'
""")
print(df)

In [None]:
# Fetch (hadm_id, valuenum) for every chart event recorded under itemid 220045.
# NOTE(review): this downloads every matching chart event, not one value per
# admission; the result may be large — consider aggregating in SQL.
df_heart = run_query("""
SELECT hadm_id, valuenum
FROM `physionet-data.mimiciv_icu.chartevents`
WHERE itemid = 220045
""")
print(df_heart)

In [None]:
# Reduce the chart-event frame to one row per admission *before* merging. The
# final result matches merging everything and de-duplicating afterwards (the
# first match per hadm_id wins either way), but the intermediate frame no
# longer grows by one row per heart-rate reading.
# NOTE(review): "first" follows the row order of df_heart, and the query that
# produced it has no ORDER BY — confirm this is the reading you want.
heart_first = df_heart.drop_duplicates(subset='hadm_id', keep='first')
data = data.merge(heart_first, on='hadm_id', how='left')
# Still enforce one row per hadm_id in case `data` itself held duplicates.
data = data.drop_duplicates(subset='hadm_id', keep='first')
data = data.rename(columns={'valuenum': 'heart_rate'})

print(data)

### **Temperature**

In [None]:
# List the ICU item definitions whose label contains "temperature"
# (presumably used to choose the itemid queried in the next cell).
df = run_query("""
SELECT itemid, label
FROM `physionet-data.mimiciv_icu.d_items`
WHERE LOWER(LABEL) LIKE '%temperature%'
""")
print(df)

In [None]:
# Fetch (hadm_id, valuenum) for every chart event recorded under itemid 223761.
# NOTE(review): check the unit of item 223761 in the d_items listing above
# (Fahrenheit vs Celsius) before using the values as a model feature.
df_temperature = run_query("""
SELECT hadm_id, valuenum
FROM `physionet-data.mimiciv_icu.chartevents`
WHERE itemid = 223761
""")
print(df_temperature)

In [None]:
# Reduce the chart-event frame to one row per admission *before* merging. The
# final result matches merging everything and de-duplicating afterwards (the
# first match per hadm_id wins either way), but the intermediate frame no
# longer grows by one row per temperature reading.
# NOTE(review): "first" follows the row order of df_temperature, and the query
# that produced it has no ORDER BY — confirm this is the reading you want.
temperature_first = df_temperature.drop_duplicates(subset='hadm_id', keep='first')
data = data.merge(temperature_first, on='hadm_id', how='left')
# Still enforce one row per hadm_id in case `data` itself held duplicates.
data = data.drop_duplicates(subset='hadm_id', keep='first')
data = data.rename(columns={'valuenum': 'temperature'})

print(data)

### **Complete Blood Count**

In [None]:
# Fetch hemoglobin, red and white blood cell values together with hadm_id
# from the derived complete_blood_count table.
df_cbc = run_query("""
SELECT hadm_id, hemoglobin, rbc, wbc
FROM `physionet-data.mimiciv_derived.complete_blood_count`
""")
print(df_cbc)

In [None]:
# Keep only blood-count rows in which every selected column is present
df_cbc = df_cbc.dropna(axis=0, how='any')

In [None]:
# Merge only the first blood-count row of each admission. The following cell
# keeps the first row per hadm_id anyway, so the end result is unchanged while
# the merged frame no longer grows by one row per blood count.
cbc_first = df_cbc.drop_duplicates(subset='hadm_id', keep='first')
data = data.merge(cbc_first, on='hadm_id', how='left')



In [None]:
# One row per admission: keep the first row seen for each hadm_id
data = data.drop_duplicates(subset=['hadm_id'], keep='first')

print(data)

### **Sputum culture**

In [None]:
# List the ICU item definitions whose label matches "sputum ... culture"
# (presumably used to choose the itemid queried in the next cell).
df = run_query("""
SELECT *
FROM `physionet-data.mimiciv_icu.d_items`
WHERE LOWER(label) LIKE '%sputum%culture%'
""")
print(df)


In [None]:
# Fetch (hadm_id, value) for every procedure event recorded under itemid 225451.
# NOTE(review): what `value` represents for this item is not visible here —
# confirm its meaning before treating it as a culture result.
df_sputum = run_query("""
SELECT t.hadm_id, t.value
FROM `physionet-data.mimiciv_icu.procedureevents` t
WHERE itemid = 225451
""")

print(df_sputum)

In [None]:
# Left-join the sputum culture events onto the feature table by admission id
data = pd.merge(data, df_sputum, how='left', on='hadm_id')

print(data)

In [None]:
# Copy 'value' into a descriptively named column, then remove the original
data = data.assign(sputum_culture=data['value']).drop(columns='value')


In [None]:
print(data)

### **Smoking**

In [None]:
# Diagnosis codes that indicate smoking-related conditions:
# F17.x (nicotine dependence) and Z72.0 (tobacco use).
# The Z72.0 branch previously read `ICD_CODE = 'Z72.0%'`; with `=` the `%` is
# a literal character, so it could never match. It now uses LIKE, and the
# decimal point is stripped before comparing so the prefix matches whether or
# not the stored code contains a dot.
df_smoke = run_query("""
SELECT subject_id
FROM `physionet-data.mimiciv_hosp.diagnoses_icd`
WHERE ICD_CODE LIKE 'F17%' OR
      REPLACE(ICD_CODE, '.', '') LIKE 'Z720%'
      """)

print(df_smoke)



In [None]:
# Flag each row whose subject_id appears in df_smoke (1 = present, 0 = absent)
is_smoker = data['subject_id'].isin(df_smoke['subject_id'])
data['smoke'] = is_smoker.astype(int)
print(data)

### **Cough**

In [None]:
# Fetch the hadm_id of every diagnosis row whose ICD code starts with "R05"
# (the code prefix this notebook uses for cough).
df_cough = run_query("""
SELECT hadm_id
FROM `physionet-data.mimiciv_hosp.diagnoses_icd`
WHERE ICD_CODE LIKE 'R05%'
      """)

print(df_cough)



In [None]:
# Flag each row whose hadm_id appears in df_cough (1 = present, 0 = absent)
has_cough = data['hadm_id'].isin(df_cough['hadm_id'])
data['cough'] = has_cough.astype(int)
print(data)

In [None]:
# One row per admission: keep the first row seen for each hadm_id
data = data.drop_duplicates(subset=['hadm_id'], keep='first')

print(data)

#### **4/ Analysis of the data**

In [None]:
data

In [None]:
data.isna().sum()

In [None]:
data.describe()

#### **5/ Add the diagnoses of each disease**

In [None]:
# Collect the ICD codes whose long title contains "pneumonia"
df = run_query("""
SELECT icd_code, long_title FROM `physionet-data.mimiciv_hosp.d_icd_diagnoses`
WHERE LOWER(long_title) LIKE "%pneumonia%"
""")

# Quote the codes and join them for use inside a SQL IN (...) list
icd_codes = df['icd_code'].tolist()
quoted_codes = ["'" + code + "'" for code in icd_codes]
icd_codes_string = ', '.join(quoted_codes)
print(icd_codes_string)

# Admissions (hadm_id) that carry at least one of the pneumonia codes
df_pneumonia = run_query(f"""
SELECT `physionet-data.mimiciv_hosp.admissions`.hadm_id
FROM `physionet-data.mimiciv_hosp.patients`
JOIN `physionet-data.mimiciv_hosp.admissions` ON `physionet-data.mimiciv_hosp.patients`.subject_id = `physionet-data.mimiciv_hosp.admissions`.subject_id
JOIN `physionet-data.mimiciv_hosp.diagnoses_icd` ON `physionet-data.mimiciv_hosp.admissions`.hadm_id = `physionet-data.mimiciv_hosp.diagnoses_icd`.hadm_id
WHERE `physionet-data.mimiciv_hosp.diagnoses_icd`.icd_code IN ({icd_codes_string})
""")
print(df_pneumonia)




In [None]:
# Flag each row whose hadm_id appears in df_pneumonia (1 = present, 0 = absent)
has_pneumonia = data['hadm_id'].isin(df_pneumonia['hadm_id'])
data['pneumonia'] = has_pneumonia.astype(int)
print(data)

In [None]:
# Collect the ICD codes whose long title contains "tuberculosis"
df = run_query("""
SELECT icd_code, long_title FROM `physionet-data.mimiciv_hosp.d_icd_diagnoses`
WHERE LOWER(long_title) LIKE "%tuberculosis%"
""")

# Quote the codes and join them for use inside a SQL IN (...) list
icd_codes = df['icd_code'].tolist()
quoted_codes = ["'" + code + "'" for code in icd_codes]
icd_codes_string = ', '.join(quoted_codes)
print(icd_codes_string)

# Admissions (hadm_id) that carry at least one of the tuberculosis codes
df_tuberculosis = run_query(f"""
SELECT `physionet-data.mimiciv_hosp.admissions`.hadm_id
FROM `physionet-data.mimiciv_hosp.patients`
JOIN `physionet-data.mimiciv_hosp.admissions` ON `physionet-data.mimiciv_hosp.patients`.subject_id = `physionet-data.mimiciv_hosp.admissions`.subject_id
JOIN `physionet-data.mimiciv_hosp.diagnoses_icd` ON `physionet-data.mimiciv_hosp.admissions`.hadm_id = `physionet-data.mimiciv_hosp.diagnoses_icd`.hadm_id
WHERE `physionet-data.mimiciv_hosp.diagnoses_icd`.icd_code IN ({icd_codes_string})
""")
print(df_tuberculosis)




In [None]:
# Flag each row whose hadm_id appears in df_tuberculosis (1 = present, 0 = absent)
has_tuberculosis = data['hadm_id'].isin(df_tuberculosis['hadm_id'])
data['tuberculosis'] = has_tuberculosis.astype(int)
print(data)

In [None]:
# Collect the ICD codes whose long title contains "bronchitis"
df = run_query("""
SELECT icd_code, long_title FROM `physionet-data.mimiciv_hosp.d_icd_diagnoses`
WHERE LOWER(long_title) LIKE "%bronchitis%"
""")

# Quote the codes and join them for use inside a SQL IN (...) list
icd_codes = df['icd_code'].tolist()
quoted_codes = ["'" + code + "'" for code in icd_codes]
icd_codes_string = ', '.join(quoted_codes)
print(icd_codes_string)

# Admissions (hadm_id) that carry at least one of the bronchitis codes
df_bronchitis = run_query(f"""
SELECT `physionet-data.mimiciv_hosp.admissions`.hadm_id
FROM `physionet-data.mimiciv_hosp.patients`
JOIN `physionet-data.mimiciv_hosp.admissions` ON `physionet-data.mimiciv_hosp.patients`.subject_id = `physionet-data.mimiciv_hosp.admissions`.subject_id
JOIN `physionet-data.mimiciv_hosp.diagnoses_icd` ON `physionet-data.mimiciv_hosp.admissions`.hadm_id = `physionet-data.mimiciv_hosp.diagnoses_icd`.hadm_id
WHERE `physionet-data.mimiciv_hosp.diagnoses_icd`.icd_code IN ({icd_codes_string})
""")
print(df_bronchitis)




In [None]:
# Flag each row whose hadm_id appears in df_bronchitis (1 = present, 0 = absent)
has_bronchitis = data['hadm_id'].isin(df_bronchitis['hadm_id'])
data['bronchitis'] = has_bronchitis.astype(int)
print(data)

In [None]:
# Collect the ICD codes whose long title contains "arthritis"
df = run_query("""
SELECT icd_code, long_title FROM `physionet-data.mimiciv_hosp.d_icd_diagnoses`
WHERE LOWER(long_title) LIKE "%arthritis%"
""")

# Quote the codes and join them for use inside a SQL IN (...) list
icd_codes = df['icd_code'].tolist()
quoted_codes = ["'" + code + "'" for code in icd_codes]
icd_codes_string = ', '.join(quoted_codes)
print(icd_codes_string)

# Admissions (hadm_id) that carry at least one of the arthritis codes
df_arthritis = run_query(f"""
SELECT `physionet-data.mimiciv_hosp.admissions`.hadm_id
FROM `physionet-data.mimiciv_hosp.patients`
JOIN `physionet-data.mimiciv_hosp.admissions` ON `physionet-data.mimiciv_hosp.patients`.subject_id = `physionet-data.mimiciv_hosp.admissions`.subject_id
JOIN `physionet-data.mimiciv_hosp.diagnoses_icd` ON `physionet-data.mimiciv_hosp.admissions`.hadm_id = `physionet-data.mimiciv_hosp.diagnoses_icd`.hadm_id
WHERE `physionet-data.mimiciv_hosp.diagnoses_icd`.icd_code IN ({icd_codes_string})
""")
print(df_arthritis)




In [None]:
# Flag each row whose hadm_id appears in df_arthritis (1 = present, 0 = absent)
has_arthritis = data['hadm_id'].isin(df_arthritis['hadm_id'])
data['arthritis'] = has_arthritis.astype(int)
print(data)

In [None]:
# Collect the ICD codes whose long title contains "fracture"
df = run_query("""
SELECT icd_code, long_title FROM `physionet-data.mimiciv_hosp.d_icd_diagnoses`
WHERE LOWER(long_title) LIKE "%fracture%"
""")

# Quote the codes and join them for use inside a SQL IN (...) list
icd_codes = df['icd_code'].tolist()
quoted_codes = ["'" + code + "'" for code in icd_codes]
icd_codes_string = ', '.join(quoted_codes)
print(icd_codes_string)

# Admissions (hadm_id) that carry at least one of the fracture codes
df_fracture = run_query(f"""
SELECT `physionet-data.mimiciv_hosp.admissions`.hadm_id
FROM `physionet-data.mimiciv_hosp.patients`
JOIN `physionet-data.mimiciv_hosp.admissions` ON `physionet-data.mimiciv_hosp.patients`.subject_id = `physionet-data.mimiciv_hosp.admissions`.subject_id
JOIN `physionet-data.mimiciv_hosp.diagnoses_icd` ON `physionet-data.mimiciv_hosp.admissions`.hadm_id = `physionet-data.mimiciv_hosp.diagnoses_icd`.hadm_id
WHERE `physionet-data.mimiciv_hosp.diagnoses_icd`.icd_code IN ({icd_codes_string})
""")
print(df_fracture)




In [None]:
# Flag each row whose hadm_id appears in df_fracture (1 = present, 0 = absent)
has_fracture = data['hadm_id'].isin(df_fracture['hadm_id'])
data['fracture'] = has_fracture.astype(int)
print(data)

In [None]:
# Collect the ICD codes whose long title matches "malignant ... neoplasm"
# NOTE(review): the pattern is not restricted to the lung, although the result
# is stored as df_lung_cancer — confirm that any malignant neoplasm is intended.
df = run_query("""
SELECT icd_code, long_title FROM `physionet-data.mimiciv_hosp.d_icd_diagnoses`
WHERE LOWER(long_title) LIKE "%malignant%neoplasm%"
""")

# Quote the codes and join them for use inside a SQL IN (...) list
icd_codes = df['icd_code'].tolist()
quoted_codes = ["'" + code + "'" for code in icd_codes]
icd_codes_string = ', '.join(quoted_codes)
print(icd_codes_string)

# Admissions (hadm_id) that carry at least one of the selected codes
df_lung_cancer = run_query(f"""
SELECT `physionet-data.mimiciv_hosp.admissions`.hadm_id
FROM `physionet-data.mimiciv_hosp.patients`
JOIN `physionet-data.mimiciv_hosp.admissions` ON `physionet-data.mimiciv_hosp.patients`.subject_id = `physionet-data.mimiciv_hosp.admissions`.subject_id
JOIN `physionet-data.mimiciv_hosp.diagnoses_icd` ON `physionet-data.mimiciv_hosp.admissions`.hadm_id = `physionet-data.mimiciv_hosp.diagnoses_icd`.hadm_id
WHERE `physionet-data.mimiciv_hosp.diagnoses_icd`.icd_code IN ({icd_codes_string})
""")
print(df_lung_cancer)




In [None]:
# Flag each row whose hadm_id appears in df_lung_cancer (1 = present, 0 = absent)
has_lung_cancer = data['hadm_id'].isin(df_lung_cancer['hadm_id'])
data['lung_cancer'] = has_lung_cancer.astype(int)
print(data)

In [None]:
# Collect the ICD codes whose long title contains "scoliosis"
df = run_query("""
SELECT icd_code, long_title FROM `physionet-data.mimiciv_hosp.d_icd_diagnoses`
WHERE LOWER(long_title) LIKE "%scoliosis%"
""")

# Quote the codes and join them for use inside a SQL IN (...) list
icd_codes = df['icd_code'].tolist()
quoted_codes = ["'" + code + "'" for code in icd_codes]
icd_codes_string = ', '.join(quoted_codes)
print(icd_codes_string)

# Admissions (hadm_id) that carry at least one of the scoliosis codes
df_scoliosis = run_query(f"""
SELECT `physionet-data.mimiciv_hosp.admissions`.hadm_id
FROM `physionet-data.mimiciv_hosp.patients`
JOIN `physionet-data.mimiciv_hosp.admissions` ON `physionet-data.mimiciv_hosp.patients`.subject_id = `physionet-data.mimiciv_hosp.admissions`.subject_id
JOIN `physionet-data.mimiciv_hosp.diagnoses_icd` ON `physionet-data.mimiciv_hosp.admissions`.hadm_id = `physionet-data.mimiciv_hosp.diagnoses_icd`.hadm_id
WHERE `physionet-data.mimiciv_hosp.diagnoses_icd`.icd_code IN ({icd_codes_string})
""")
print(df_scoliosis)




In [None]:
# Flag each row whose hadm_id appears in df_scoliosis (1 = present, 0 = absent)
has_scoliosis = data['hadm_id'].isin(df_scoliosis['hadm_id'])
data['scoliosis'] = has_scoliosis.astype(int)
print(data)

In [None]:
# Find ICD codes whose long title contains "infection".
# The pattern previously read "%infection" (no trailing wildcard), so only
# titles *ending* in "infection" were selected, unlike every other lookup in
# this notebook; it now matches the word anywhere in the title.
# NOTE(review): the pattern is not restricted to the lung, although the result
# is stored as df_lung_infection — confirm that any infection is intended.
df = run_query("""
SELECT icd_code, long_title FROM `physionet-data.mimiciv_hosp.d_icd_diagnoses`
WHERE LOWER(long_title) LIKE "%infection%"
""")

# ICD code list, quoted and comma-separated for use inside a SQL IN (...) list
icd_codes = df['icd_code'].to_list()
icd_codes_string = ', '.join(["'" + icd + "'" for icd in icd_codes])
print(icd_codes_string)

# Admissions (hadm_id) that carry at least one of the selected infection codes
df_lung_infection = run_query(f"""
SELECT `physionet-data.mimiciv_hosp.admissions`.hadm_id
FROM `physionet-data.mimiciv_hosp.patients`
JOIN `physionet-data.mimiciv_hosp.admissions` ON `physionet-data.mimiciv_hosp.patients`.subject_id = `physionet-data.mimiciv_hosp.admissions`.subject_id
JOIN `physionet-data.mimiciv_hosp.diagnoses_icd` ON `physionet-data.mimiciv_hosp.admissions`.hadm_id = `physionet-data.mimiciv_hosp.diagnoses_icd`.hadm_id
WHERE `physionet-data.mimiciv_hosp.diagnoses_icd`.icd_code IN ({icd_codes_string})
""")
print(df_lung_infection)




In [None]:
# Flag each row whose hadm_id appears in df_lung_infection (1 = present, 0 = absent)
has_lung_infection = data['hadm_id'].isin(df_lung_infection['hadm_id'])
data['lung_infection'] = has_lung_infection.astype(int)
print(data)

In [None]:
# One row per admission: keep the first row seen for each hadm_id
data = data.drop_duplicates(subset=['hadm_id'], keep='first')

print(data)

#### **6/ Analysis of the data**

In [None]:
data.isna().sum()

In [None]:
data.describe()

In [None]:
# Per column, count the entries that are not equal to 0
non_zero_counts = data.ne(0).sum()
print(non_zero_counts)

In [None]:
# Number of disease labels set on each row: add the eight flag columns together
disease_columns = [
    'pneumonia', 'tuberculosis', 'bronchitis', 'arthritis',
    'fracture', 'lung_cancer', 'scoliosis', 'lung_infection',
]
label_total = data[disease_columns[0]]
for column_name in disease_columns[1:]:
    label_total = label_total + data[column_name]
data['sum'] = label_total


In [None]:
print(data)

In [None]:
data['sum'].value_counts()

Some patients have multiple diseases at the same time

In [None]:
data.to_csv('other_feature_data.csv', index=False)

#### **7/ Merge these additional features with the x-ray data**

In [None]:
# Only 'time' is dropped: it comes back from data_x_ray in the merge below.
# 'hadm_id' is kept because the features above were attached per admission
# and must be re-joined to that same admission.
columns_to_drop = ['time']

# Drop the specified columns
data = data.drop(columns=columns_to_drop)
display(data)

# Merge on (subject_id, hadm_id). Joining on subject_id alone paired every
# admission of a patient with every x-ray row of that patient; the
# de-duplication below then kept, for each hadm_id, the features of whichever
# admission of that patient came last, i.e. possibly a different admission.
data_f = data.merge(data_x_ray, on=['subject_id', 'hadm_id'], how='left')

# Keep one row per admission (the last x-ray row found for each hadm_id)
data_f = data_f.drop_duplicates(subset='hadm_id', keep='last')

print(data_f)

In [None]:
data_f.to_csv('x_ray_and_features_data.csv', index=False)