### Bibliotecas

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import gc
from tqdm import tqdm

In [2]:
# itemid values to keep when filtering the chartevents / labevents files.
# Order is preserved exactly as originally listed.
itemids = [
    50817, 51301, 51144, 51288, 50889, 50861, 53084,
    50878, 53088, 50885, 53089, 51842, 52024, 51237,
    51675, 51240, 51002, 51003, 52642, 50809, 50931,
    52569, 50983, 52623, 50811, 51222, 51640, 50902,
    52535, 50882, 50813, 52442, 53154, 52022, 53138,
    50862, 53085, 220045, 220210, 227242, 227243, 223762,
    53134, 51300, 51704, 52159, 52647, 51006, 50825,
]

In [3]:
# Root directories of the two MIMIC-IV 2.2 modules used by this notebook.
BASES_HOSP = "/scratch/haniel.botelho/physionet.org/files/mimiciv/2.2/hosp/"
BASES_ICU = "/scratch/haniel.botelho/physionet.org/files/mimiciv/2.2/icu/"

# File names of the individual tables.
patient = "patients.csv"
admission = "admissions.csv"
diagnostics = "diagnoses_icd.csv"
icd_diagnostics = "d_icd_diagnoses.csv"
lab = "labevents.csv"
lab_items = "d_labitems.csv"
chart_items = "d_items.csv"
chart = "chartevents.csv"
icu = "icustays.csv"
# Same file as `icd_diagnostics`; kept because later cells use this name.
id_dignoses = "d_icd_diagnoses.csv"

# Full paths: hosp-module tables.
url_patient = BASES_HOSP + patient
url_admission = BASES_HOSP + admission
url_diagnostics = BASES_HOSP + diagnostics
url_icd_diagnostics = BASES_HOSP + icd_diagnostics
url_lab = BASES_HOSP + lab
url_lab_items = BASES_HOSP + lab_items
url_id_dignoses = BASES_HOSP + id_dignoses

# Full paths: icu-module tables.
# d_items.csv is the dictionary for chartevents and is distributed with the
# icu module, so it must be resolved against BASES_ICU (it was previously
# built from BASES_HOSP, which points at a file that does not exist).
url_chart_items = BASES_ICU + chart_items
url_chart = BASES_ICU + chart
url_icu = BASES_ICU + icu

In [4]:
def aggregate_events(df):
    """Spread the `value` column into one column per clinical measurement.

    For every measurement name in the internal itemid map, a column of that
    name is added to `df`. A row carries its `value` in that column when the
    row's `itemid` belongs to the measurement, and is missing (NaN) otherwise.
    The placeholder strings "" and "___" are also treated as missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `itemid` and `value`.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; the new columns are added in place.
    """
    itemid_map = {
        'Albumine': [52022, 53138, 50862, 53085],
        'Lactate': [50813, 52442, 53154],
        # Bicarbonate and Creatinine previously repeated the Lactate ids
        # (copy-paste), so all three columns held lactate values. 50882 and
        # 52024 are the ids in the notebook's global `itemids` filter that sit
        # in the bicarbonate / creatinine positions and were otherwise unmapped.
        # NOTE(review): confirm both ids against d_labitems.
        'Bicarbonate': [50882],
        'Chloride': [50902, 52535],
        'Hemoglobin': [50811, 51222, 51640],
        'Sodium': [50983, 52623],
        'Glucose': [50809, 50931, 52569],
        'Troponin': [51002, 51003, 52642],
        'Platelets': [51240,51704,52159],
        'INR': [51237, 51675],
        'Creatinine': [52024],
        'Bun': [51842,52647,51006],
        'Bilirubin': [50885, 53089],
        'AST': [50878, 53088],
        'ALT': [50861, 53084],
        'C_reactive': [50889],
        'ESR': [51288],
        'Bands': [51144],
        'WBC': [51301,53134,51300],
        'Oxygen_saturation': [50817],
        'Temperature': [223762],
        'Systolic_pressure': [227242, 227243],
        'Respiratory_rate': [220210],
        'Heart_rate': [220045]
    }

    # Blank out placeholder strings once, outside the loop (loop-invariant).
    placeholders = ["", "___"]
    clean_value = df["value"].mask(df["value"].isin(placeholders))

    for key, ids in itemid_map.items():
        # `where` keeps the value on matching rows and yields NaN elsewhere.
        # The former `bool * value` trick only worked for strings (False * "x"
        # == ""); numeric values became 0 instead of missing.
        df[key] = clean_value.where(df["itemid"].isin(ids))
    return df


### Base de dados

#### Exames

In [5]:
# Number of CSV rows parsed per chunk; lower it if the kernel runs out of memory.
chunk_size = 10 * 1_000_000

# Accumulator for the filtered pieces of each chunk.
filtered_chunks = list()

In [6]:
# Stream chartevents in chunks, keeping only the rows whose itemid is of interest.
# Only these five columns are ever kept, so `usecols` avoids parsing (and
# holding in memory) every other column of each chunk.
chart_columns = ['subject_id', 'hadm_id', 'charttime', 'itemid', 'value']

# Reset the accumulator here so that re-running this cell does not append the
# same rows a second time.
filtered_chunks = []
for chunk in tqdm(pd.read_csv(url_chart, chunksize=chunk_size, usecols=chart_columns)):
    # Select matching rows; the explicit column list fixes the column order.
    filtered_chunks.append(chunk.loc[chunk['itemid'].isin(itemids), chart_columns])
    # Free the parsed chunk before the next one is read.
    del chunk
    gc.collect()

32it [04:20,  8.13s/it]


In [7]:
# Stitch the filtered chartevents pieces into a single frame with a fresh index,
# then release the list of pieces.
df_selected_chartvents = pd.concat(objs=filtered_chunks, axis=0, ignore_index=True)
del filtered_chunks
gc.collect()
df_selected_chartvents

Unnamed: 0,subject_id,hadm_id,charttime,itemid,value
0,10000032,29079034,2180-07-23 22:00:00,220045,94
1,10000032,29079034,2180-07-23 22:00:00,220210,20
2,10000032,29079034,2180-07-23 19:00:00,220045,97
3,10000032,29079034,2180-07-23 19:00:00,220210,16
4,10000032,29079034,2180-07-23 20:00:00,220045,100
...,...,...,...,...,...
13121402,19999987,23865745,2145-11-04 19:00:00,220210,20
13121403,19999987,23865745,2145-11-04 20:00:00,220045,92
13121404,19999987,23865745,2145-11-04 20:00:00,220210,23
13121405,19999987,23865745,2145-11-04 21:00:00,220045,87


In [8]:
# Stream labevents in chunks, keeping only the rows whose itemid is of interest.
# Only these five columns are ever kept, so `usecols` avoids parsing (and
# holding in memory) every other column of each chunk.
lab_columns = ['subject_id', 'hadm_id', 'charttime', 'itemid', 'value']

filtered_chunks = []
for chunk in tqdm(pd.read_csv(url_lab, chunksize=chunk_size, usecols=lab_columns)):
    # Select matching rows; the explicit column list fixes the column order.
    filtered_chunks.append(chunk.loc[chunk['itemid'].isin(itemids), lab_columns])
    # Free the parsed chunk before the next one is read.
    del chunk
    gc.collect()

12it [02:06, 10.52s/it]


In [9]:
# Stitch the filtered labevents pieces into a single frame with a fresh index,
# then release the list of pieces.
df_selected_labevents = pd.concat(objs=filtered_chunks, axis=0, ignore_index=True)
del filtered_chunks
gc.collect()
df_selected_labevents

Unnamed: 0,subject_id,hadm_id,charttime,itemid,value
0,10000032,,2180-03-23 11:51:00,51237,1.4
1,10000032,,2180-03-23 11:51:00,50861,102
2,10000032,,2180-03-23 11:51:00,50862,3.3
3,10000032,,2180-03-23 11:51:00,50878,143
4,10000032,,2180-03-23 11:51:00,50882,27
...,...,...,...,...,...
29370921,19999987,23865745.0,2145-11-09 05:30:00,50983,144
29370922,19999987,23865745.0,2145-11-09 05:30:00,51006,8
29370923,19999987,23865745.0,2145-11-09 05:30:00,51222,12.3
29370924,19999987,23865745.0,2145-11-09 05:30:00,51301,5.7


In [10]:
# Stack chart events on top of lab events; both frames share the same five columns.
df_exams = pd.concat(
    objs=(df_selected_chartvents, df_selected_labevents),
    axis=0,
    ignore_index=True,
)

In [11]:
# The two source frames are now contained in df_exams; drop them and collect.
del df_selected_chartvents, df_selected_labevents
gc.collect()

0

In [13]:
# Persist the combined events table, without the positional index column.
df_exams.to_csv(path_or_buf='exams.csv', index=False)

#### Diagnósticos

In [14]:
# ICD codes are identifiers, not numbers. Left to type inference, read_csv
# (which parses large files in internal chunks by default) can turn all-digit
# codes into integers, dropping leading zeros and producing a mixed-type column
# whose rows then fail to match in the merge below. Pin the column to str in
# both files so the join keys compare as text.
icd_dtypes = {"icd_code": str}
df_diagnostics = pd.read_csv(url_diagnostics, dtype=icd_dtypes)
df_id = pd.read_csv(url_id_dignoses, dtype=icd_dtypes)

# Attach the dictionary columns of each (icd_code, icd_version) pair; the inner
# join keeps only diagnoses that have a dictionary entry.
df_diagnostics = pd.merge(df_diagnostics, df_id,how = "inner",on = ["icd_code","icd_version"])
del df_id
gc.collect()

299

In [15]:
# Flag diagnoses whose long title mentions sepsis. One case-insensitive scan
# replaces the two separate "sepsis" / "Sepsis" scans; na=False makes a missing
# title count as "no match" so the column is strictly boolean.
df_diagnostics['sepsis'] = df_diagnostics['long_title'].str.contains("sepsis", case=False, na=False)


In [16]:
# Persist the labelled diagnoses table, without the positional index column.
df_diagnostics.to_csv(path_or_buf='diagnostics.csv', index=False)