In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Folder that holds the compressed source tables; the path is written
# relative to the notebooks folder.
DATA_PATH = "../data"

# List what is actually present in the data folder before loading anything.
files = os.listdir(DATA_PATH)
print(f"Available files: {files}")

Available files: ['patients.csv.gz', 'transfers.csv.gz', 'admissions.csv.gz', 'procedures_icd.csv.gz', 'diagnoses_icd.csv.gz']


In [3]:
def load_csv_gz(filename, usecols=None, nrows=None, data_dir=None):
    """Read a gzip-compressed CSV file into a DataFrame and report its shape.

    Parameters
    ----------
    filename : str
        Name of the file inside the data directory, e.g. ``"patients.csv.gz"``.
    usecols : optional
        Forwarded unchanged to ``pandas.read_csv`` to restrict which columns
        are read. ``None`` reads every column.
    nrows : int, optional
        Forwarded unchanged to ``pandas.read_csv`` to limit how many rows are
        read. ``None`` reads the whole file.
    data_dir : str or os.PathLike, optional
        Directory that contains ``filename``. When ``None`` (the default) the
        module-level ``DATA_PATH`` is used, which is the original behavior.

    Returns
    -------
    pandas.DataFrame
        The parsed table.

    Notes
    -----
    Prints ``"<filename> loaded: <shape>"`` as a side effect. Errors raised by
    ``pandas.read_csv`` are not caught here and propagate to the caller.
    """
    # Only fall back to the notebook-level constant when no directory is given,
    # so the function can be reused against other folders.
    base_dir = DATA_PATH if data_dir is None else data_dir
    path = os.path.join(base_dir, filename)
    df = pd.read_csv(
        path,
        compression='gzip',
        # Parse the file in one pass instead of in internal chunks.
        low_memory=False,
        usecols=usecols,
        nrows=nrows
    )
    print(f"{filename} loaded: {df.shape}")
    return df

In [4]:
# Load the five source tables in full (no column or row limits), in the same
# order as they are named on the left-hand side.
admissions, patients, diagnoses, procedures, transfers = map(
    load_csv_gz,
    (
        "admissions.csv.gz",
        "patients.csv.gz",
        "diagnoses_icd.csv.gz",
        "procedures_icd.csv.gz",
        "transfers.csv.gz",
    ),
)

admissions.csv.gz loaded: (546028, 16)
patients.csv.gz loaded: (364627, 6)
diagnoses_icd.csv.gz loaded: (6364488, 5)
procedures_icd.csv.gz loaded: (859655, 6)
transfers.csv.gz loaded: (2413581, 7)


In [5]:
# Preview every loaded table: a banner with its name, the first two rows,
# then the column / dtype / memory summary.
for name, df in {
    "admissions": admissions,
    "patients": patients,
    "diagnoses": diagnoses,
    "procedures": procedures,
    "transfers": transfers
}.items():
    print(f"\n{name.upper()}")
    print(df.head(2))
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would add a stray "None" line after each table.
    # memory_usage='deep' asks pandas to measure object columns precisely.
    df.info(memory_usage='deep')


ADMISSIONS
   subject_id   hadm_id            admittime            dischtime deathtime  \
0    10000032  22595853  2180-05-06 22:23:00  2180-05-07 17:15:00       NaN   
1    10000032  22841357  2180-06-26 18:27:00  2180-06-27 18:49:00       NaN   

  admission_type admit_provider_id      admission_location discharge_location  \
0         URGENT            P49AFC  TRANSFER FROM HOSPITAL               HOME   
1       EW EMER.            P784FA          EMERGENCY ROOM               HOME   

  insurance language marital_status   race            edregtime  \
0  Medicaid  English        WIDOWED  WHITE  2180-05-06 19:17:00   
1  Medicaid  English        WIDOWED  WHITE  2180-06-26 15:54:00   

             edouttime  hospital_expire_flag  
0  2180-05-06 23:30:00                     0  
1  2180-06-26 21:31:00                     0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 546028 entries, 0 to 546027
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype 
-