# PhysioNet2019 Implementation

In [None]:
# Display the value of a trailing expression or assignment in every cell.
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
# Render inline matplotlib figures as SVG.
%config InlineBackend.figure_format = 'svg'
# Load the autoreload extension and use mode 2 (reload modules before running code).
%load_ext autoreload
%autoreload 2
%matplotlib inline

import logging

# Configure the root logger to emit records of level INFO and above.
logging.basicConfig(level=logging.INFO)

In [None]:
from functools import cached_property
from pathlib import Path
from zipfile import ZipFile

import pandas as pd
from pandas import DataFrame
from tqdm.autonotebook import tqdm

from tsdm.datasets import SingleFrameDataset


class PhysioNet2019(SingleFrameDataset):
    r"""Dataset wrapper for the PhysioNet Challenge 2019 training archives.

    The raw data consists of the zip archives listed in `rawdata_files`. Every
    ``.psv`` member of an archive is read as a pipe-separated table, and the
    tables are stacked into a single frame indexed by ``(set, patient, time)``.
    """

    BASE_URL: str = r"https://archive.physionet.org/users/shared/challenge-2019/"
    r"""HTTP address from where the dataset can be downloaded"""
    INFO_URL: str = r"https://physionet.org/content/challenge-2019/"
    r"""HTTP address containing additional information about the dataset"""

    rawdata_files: dict[str, str] = {"A": "training_setA.zip", "B": "training_setB.zip"}
    r"""Archive file name of each training set, keyed by the set identifier."""

    @cached_property
    def units(self) -> DataFrame:
        r"""Build the metadata table describing every column of the dataset.

        Returns
        -------
        DataFrame
            Indexed by ``variable`` with the columns ``description``, ``unit``
            and ``dtype``. Entries written as ``"N/A"`` are stored as `pandas.NA`.
        """
        # Every row is (variable, description, unit); "N/A" marks a missing entry.
        units = [
            # Vital signs (columns 1-8)
            ("HR", "Heart rate", "beats per minute"),
            ("O2Sat", "Pulse oximetry", "%"),
            ("Temp", "Temperature", "Deg C"),
            ("SBP", "Systolic BP", "mm Hg"),
            ("MAP", "Mean arterial pressure", "mm Hg"),
            ("DBP", "Diastolic BP", "mm Hg"),
            ("Resp", "Respiration rate", "breaths per minute"),
            ("EtCO2", "End tidal carbon dioxide", "mm Hg"),
            # Laboratory values (columns 9-34)
            ("BaseExcess", "Measure of excess bicarbonate", "mmol/L"),
            ("HCO3", "Bicarbonate", "mmol/L"),
            ("FiO2", "Fraction of inspired oxygen", "%"),
            ("pH", "N/A", "N/A"),
            (
                "PaCO2",
                "Partial pressure of carbon dioxide from arterial blood",
                "mm Hg",
            ),
            ("SaO2", "Oxygen saturation from arterial blood", "%"),
            ("AST", "Aspartate transaminase", "IU/L"),
            ("BUN", "Blood urea nitrogen", "mg/dL"),
            ("Alkalinephos", "Alkaline phosphatase", "IU/L"),
            ("Calcium", "N/A", "mg/dL"),
            ("Chloride", "N/A", "mmol/L"),
            ("Creatinine", "N/A", "mg/dL"),
            ("Bilirubin_direct", "Bilirubin direct", "mg/dL"),
            ("Glucose", "Serum glucose", "mg/dL"),
            ("Lactate", "Lactic acid", "mg/dL"),
            ("Magnesium", "N/A", "mmol/dL"),
            ("Phosphate", "N/A", "mg/dL"),
            ("Potassium", "N/A", "mmol/L"),
            ("Bilirubin_total", "Total bilirubin", "mg/dL"),
            ("TroponinI", "Troponin I", "ng/mL"),
            ("Hct", "Hematocrit", "%"),
            ("Hgb", "Hemoglobin", "g/dL"),
            ("PTT", "partial thromboplastin time", "seconds"),
            ("WBC", "Leukocyte count", "count*10^3/µL"),
            ("Fibrinogen", "N/A", "mg/dL"),
            ("Platelets", "N/A", "count*10^3/µL"),
            # Demographics (columns 35-40)
            # The unit is spelled out as "N/A" so that every row has three fields
            # instead of relying on pandas to pad a short tuple.
            ("Age", "Years (100 for patients 90 or above)", "N/A"),
            ("Gender", "Female (0) or Male (1)", "N/A"),
            ("Unit1", "Administrative identifier for ICU unit", "MICU"),
            ("Unit2", "Administrative identifier for ICU unit", "SICU"),
            ("HospAdmTime", "Hours between hospital admit and ICU admit", "N/A"),
            ("ICULOS", "ICU length-of-stay (hours since ICU admit)", "N/A"),
            # Outcome (column 41)
            (
                "SepsisLabel",
                "For sepsis patients, SepsisLabel is 1 if t≥tsepsis−6 and 0 if t<tsepsis−6. "
                "For non-sepsis patients, SepsisLabel is 0.",
                "N/A",
            ),
        ]

        units = pd.DataFrame(
            units, columns=["variable", "description", "unit"], dtype="string"
        )
        units = units.replace("N/A", pd.NA)
        units = units.set_index("variable")

        # Nullable Float32 by default; flags and the hour counter get dedicated dtypes.
        dtypes = {key: "Float32" for key in units.index} | {
            "Gender": "boolean",
            "Unit1": "boolean",
            "Unit2": "boolean",
            "ICULOS": "Int32",
            "SepsisLabel": "boolean",
        }

        units["dtype"] = pd.Series(dtypes)
        return units

    def _get_frame(self, path) -> DataFrame:
        r"""Load every ``.psv`` member of a single archive into one frame.

        Parameters
        ----------
        path
            Location of the zip archive. Its ``stem`` is shown in the progress
            bar caption, so a `pathlib.Path` is expected.

        Returns
        -------
        DataFrame
            Indexed by ``(patient, time)`` and cast to the dtypes given in
            `units`; the column axis is named ``variable``.
        """
        with ZipFile(path) as archive, tqdm(archive.namelist()) as progress_bar:
            frames = {}
            progress_bar.set_description(f"Loading patient data {path.stem}")

            for compressed_file in progress_bar:
                # Use a separate name so the `path` argument is not overwritten.
                member = Path(compressed_file)
                if member.suffix != ".psv":
                    continue
                with archive.open(compressed_file) as file:
                    # Patient key: the file stem without its first character.
                    frames[member.stem[1:]] = pd.read_csv(file, sep="|", header=0)

        self.LOGGER.info("Concatenating DataFrames")
        frame = pd.concat(frames, names=["patient", "time"])
        frame = frame.astype(self.units["dtype"])
        frame.columns.name = "variable"
        return frame

    def _clean(self) -> DataFrame:
        r"""Load all raw archives and stack them under an outer ``set`` index level.

        Returns
        -------
        DataFrame
            Concatenation of the frames produced by `_get_frame` for every entry
            of ``self.rawdata_paths``, keyed by that entry's key.
        """
        frames = {
            key: self._get_frame(path) for key, path in self.rawdata_paths.items()
        }
        frame = pd.concat(frames, names=["set"])
        return frame

In [None]:
# Instantiate the dataset class defined in the previous cell.
ds = PhysioNet2019()

In [None]:
# Column metadata table (description, unit, dtype).
ds.units

In [None]:
# NOTE(review): `info` is not defined in this notebook; presumably inherited
# from SingleFrameDataset — confirm.
ds.info()

In [None]:
# Locations of the raw archives; `_clean` iterates over this with `.items()`.
ds.rawdata_paths

## Metadata

In [None]:
# Stand-alone copy of the column metadata: every row is
# (variable, description, unit), with "N/A" marking a missing entry.
units = [
    # Vital signs (columns 1-8)
    ("HR", "Heart rate", "beats per minute"),
    ("O2Sat", "Pulse oximetry", "%"),
    ("Temp", "Temperature", "Deg C"),
    ("SBP", "Systolic BP", "mm Hg"),
    ("MAP", "Mean arterial pressure", "mm Hg"),
    ("DBP", "Diastolic BP", "mm Hg"),
    ("Resp", "Respiration rate", "breaths per minute"),
    ("EtCO2", "End tidal carbon dioxide", "mm Hg"),
    # Laboratory values (columns 9-34)
    ("BaseExcess", "Measure of excess bicarbonate", "mmol/L"),
    ("HCO3", "Bicarbonate", "mmol/L"),
    ("FiO2", "Fraction of inspired oxygen", "%"),
    ("pH", "N/A", "N/A"),
    ("PaCO2", "Partial pressure of carbon dioxide from arterial blood", "mm Hg"),
    ("SaO2", "Oxygen saturation from arterial blood", "%"),
    ("AST", "Aspartate transaminase", "IU/L"),
    ("BUN", "Blood urea nitrogen", "mg/dL"),
    ("Alkalinephos", "Alkaline phosphatase", "IU/L"),
    ("Calcium", "N/A", "mg/dL"),
    ("Chloride", "N/A", "mmol/L"),
    ("Creatinine", "N/A", "mg/dL"),
    ("Bilirubin_direct", "Bilirubin direct", "mg/dL"),
    ("Glucose", "Serum glucose", "mg/dL"),
    ("Lactate", "Lactic acid", "mg/dL"),
    ("Magnesium", "N/A", "mmol/dL"),
    ("Phosphate", "N/A", "mg/dL"),
    ("Potassium", "N/A", "mmol/L"),
    ("Bilirubin_total", "Total bilirubin", "mg/dL"),
    ("TroponinI", "Troponin I", "ng/mL"),
    ("Hct", "Hematocrit", "%"),
    ("Hgb", "Hemoglobin", "g/dL"),
    ("PTT", "partial thromboplastin time", "seconds"),
    ("WBC", "Leukocyte count", "count*10^3/µL"),
    ("Fibrinogen", "N/A", "mg/dL"),
    ("Platelets", "N/A", "count*10^3/µL"),
    # Demographics (columns 35-40)
    # The unit is spelled out as "N/A" so that every row has three fields
    # instead of relying on pandas to pad a short tuple.
    ("Age", "Years (100 for patients 90 or above)", "N/A"),
    ("Gender", "Female (0) or Male (1)", "N/A"),
    ("Unit1", "Administrative identifier for ICU unit", "MICU"),
    ("Unit2", "Administrative identifier for ICU unit", "SICU"),
    ("HospAdmTime", "Hours between hospital admit and ICU admit", "N/A"),
    ("ICULOS", "ICU length-of-stay (hours since ICU admit)", "N/A"),
    # Outcome (column 41)
    (
        "SepsisLabel",
        "For sepsis patients, SepsisLabel is 1 if t≥tsepsis−6 and 0 if t<tsepsis−6. "
        "For non-sepsis patients, SepsisLabel is 0.",
        "N/A",
    ),
]

units = pd.DataFrame(units, columns=["variable", "description", "unit"], dtype="string")
units = units.replace("N/A", pd.NA)
units = units.set_index("variable")

# Nullable Float32 by default; flags and the hour counter get dedicated dtypes.
dtypes = {key: "Float32" for key in units.index} | {
    "Gender": "boolean",
    "Unit1": "boolean",
    "Unit2": "boolean",
    "ICULOS": "Int32",
    "SepsisLabel": "boolean",
}

units["dtype"] = pd.Series(dtypes)

In [None]:
# Display the metadata table built in the previous cell.
units

In [None]:
# `rawdata_paths` is consumed as a mapping elsewhere in this notebook (`.items()`)
# and `rawdata_files` is keyed by "A"/"B", so pick the first archive by its key
# rather than by position.
pth = ds.rawdata_paths["A"]

In [None]:
# Print every column of `df` that cannot be cast to nullable Int32.
for col in df.columns:
    try:
        df[col].astype("Int32")
    except (TypeError, ValueError):
        # Only failed casts are of interest here; a bare `except` would also
        # swallow KeyboardInterrupt and unrelated errors.
        print(col)

In [None]:
# Distinct values found in the "HR" column of `df`.
df["HR"].unique()

In [None]:
# Resolve the sample patient file relative to the current user's home directory
# instead of hard-coding one particular account.
file = (
    Path.home() / ".tsdm/rawdata/PhysioNet2019/training_setA/training/p000001.psv"
)
df = pd.read_csv(file, sep="|", header=0)

## Loading all files

In [None]:
from tqdm.autonotebook import tqdm, trange

In [None]:
from zipfile import ZipFile
from pathlib import Path
import pandas as pd

In [None]:
# `rawdata_paths` is consumed as a mapping elsewhere in this notebook (`.items()`)
# and `rawdata_files` is keyed by "A"/"B", so pick the second archive by its key
# rather than by position.
pth = ds.rawdata_paths["B"]

# Read every `.psv` member of the archive into `frames`, keyed by patient.
with ZipFile(pth) as archive, tqdm(archive.namelist()) as progress_bar:
    frames = {}
    progress_bar.set_description("Loading patient data")

    for compressed_file in progress_bar:
        path = Path(compressed_file)
        if path.suffix != ".psv":
            continue
        # Patient key: the file stem without its first character.
        name = path.stem[1:]
        with archive.open(compressed_file) as file:
            df = pd.read_csv(file, sep="|", header=0)
            frames[name] = df

In [None]:
%%time
# Stack the per-patient frames; the dict keys become the "patient" index level.
# NOTE(review): the trailing semicolon presumably suppresses the cell's output
# display — confirm.
FRAME = pd.concat(frames, names=["patient", "time"]);

In [None]:
# Cast every column to the dtype chosen in the metadata cell.
FRAME = FRAME.astype(dtypes)

In [None]:
# Distinct values found in the "EtCO2" column.
FRAME.EtCO2.unique()

In [None]:
FRAME

In [None]:
def _get_frame(path):
    r"""Read all ``.psv`` members of the zip archive at ``path`` into one frame.

    The result is indexed by ``(patient, time)``, cast to the dtypes listed in
    the notebook-level ``units`` table, and its column axis is named ``variable``.
    ``path`` must offer ``.stem``, which is shown in the progress bar caption.
    """
    patient_frames = {}

    with ZipFile(path) as archive:
        with tqdm(archive.namelist(), leave=False) as progress_bar:
            progress_bar.set_description(f"Loading patient data {path.stem}")

            for member_name in progress_bar:
                member = Path(member_name)
                if member.suffix != ".psv":
                    continue
                # Patient key: the file stem without its first character.
                patient = member.stem[1:]
                with archive.open(member_name) as handle:
                    patient_frames[patient] = pd.read_csv(handle, sep="|", header=0)

    stacked = pd.concat(patient_frames, names=["patient", "time"])
    stacked = stacked.astype(units["dtype"])
    stacked.columns.name = "variable"
    return stacked

In [None]:
import os
from pathlib import Path

In [None]:
from typing import Union

In [None]:
# Scratch check: look at the `__args__` attribute of a `Union` alias.
Union[str, int].__args__

In [None]:
# Map every key of `frames` to the unique "patient" labels of its frame's index.
split = {
    set_name: set_frame.index.unique(level="patient")
    for set_name, set_frame in frames.items()
}
pd.Series(split)

In [None]:
%%time
# Stack the frames held in `frames`; the dict keys become the outer "set" level.
frame = pd.concat(frames, names=["set"])

In [None]:
frame

In [None]:
# Load every raw archive with `_get_frame`, keyed like `ds.rawdata_paths`.
frames = {key: _get_frame(path) for key, path in ds.rawdata_paths.items()}
# Only the alias `pd` is imported in this notebook (`pandas.concat` is a NameError).
# The outer level is named "set", matching `PhysioNet2019._clean`.
frame = pd.concat(frames, names=["set"])

In [None]:
# Display the raw archive locations held by the dataset object.
ds.rawdata_paths

In [None]:
# Materialise the (key, path) pairs of `ds.rawdata_paths` as a plain dict.
dict(ds.rawdata_paths.items())