# MIMIC-IV data - dataset construction: prescriptions

In [None]:
import gzip
from pathlib import Path
from zipfile import ZipFile

import pandas as pd
import pyarrow
import pyarrow.csv
import pyarrow.parquet

# Load `prescriptions` table

## Table Schema

In [None]:
# All locations are resolved relative to the directory the notebook runs in.
# Zip archive holding the raw MIMIC-IV 1.0 release.
rawdata_file = Path.cwd().joinpath("mimic-iv-1.0.zip")
# Directory for the processed tables.
dataset_path = Path.cwd().joinpath("processed")
# Directory for the raw tables converted to parquet.
rawdata_path = Path.cwd().joinpath("raw")

# Table name -> member path inside the zip archive.
files = dict(prescriptions="mimic-iv-1.0/hosp/prescriptions.csv.gz")

# Arrow dictionary type (int32 indices, string values) used for the
# categorical text columns of the schema below.
CATEGORY = pyarrow.dictionary("int32", "string")
# Type for identifier columns; the schema below spells "int32" out directly.
ID_TYPE = "int32"  # pyarrow.dictionary("int32", "int32", ordered=True)


# Table name -> {column name -> Arrow type requested from the CSV reader}.
column_types = {
    "prescriptions": {
        "subject_id": "int32",
        "hadm_id": "int32",
        "pharmacy_id": "int32",
        "starttime": "timestamp[s]",
        "stoptime": "timestamp[s]",
        "drug_type": CATEGORY,
        "drug": CATEGORY,
        "gsn": CATEGORY,
        # float32 represents integers exactly only up to 2**24, which is too
        # small for NDC codes (up to 11 digits); float64 keeps them exact.
        # NOTE(review): a string/dictionary type would additionally preserve
        # leading zeros - confirm what downstream consumers expect.
        "ndc": "float64",
        "prod_strength": CATEGORY,
        "form_rx": CATEGORY,
        "dose_val_rx": "string",  # float or interval[float]
        "dose_unit_rx": CATEGORY,
        "form_val_disp": CATEGORY,  # float or interval[float]
        "form_unit_disp": CATEGORY,
        "doses_per_24_hrs": "float32",
        "route": CATEGORY,
    }
}

# Strings that the CSV reader should interpret as missing values.
# Each spelling is listed exactly once (a duplicated "NULL" entry was removed).
null_values = [
    "-",
    "-1.#IND",
    "-1.#QNAN",
    "-NaN",
    "-nan",
    "?",
    "",
    "#N/A N/A",
    "#N/A",
    "#NA",
    "#na",
    "<N/A>",
    "<n/a>",
    "<NA>",
    "<na>",
    "1.#IND",
    "1.#QNAN",
    "INFORMATION NOT AVAILABLE",
    "N/A",
    "n/a",
    "NA",
    "na",
    "NAN",
    "NaN",
    "nan",
    "NONE",
    "None",
    "none",
    "NULL",
    "Null",
    "null",
    "UNABLE TO OBTAIN",
    "UNKNOWN",
    "unknown",
]

# Arrow type name -> nullable pandas extension dtype. Passed (via `.get`) as
# the `types_mapper` when converting Arrow tables to pandas; type names that
# are not listed here map to None.
types_map = dict(
    string=pd.StringDtype(),
    bool=pd.BooleanDtype(),
    # signed integers
    int8=pd.Int8Dtype(),
    int16=pd.Int16Dtype(),
    int32=pd.Int32Dtype(),
    int64=pd.Int64Dtype(),
    # unsigned integers
    uint8=pd.UInt8Dtype(),
    uint16=pd.UInt16Dtype(),
    uint32=pd.UInt32Dtype(),
    uint64=pd.UInt64Dtype(),
)

## Load Prescriptions

In [None]:
%%time
key = "prescriptions"

# The table is a gzip-compressed CSV stored inside the zip archive, so three
# layers are opened: the archive, the archive member, and the gzip stream.
with ZipFile(rawdata_file) as zip_archive:
    with zip_archive.open(files[key]) as gz_member:
        with gzip.open(gz_member) as csv_stream:
            prescriptions = pyarrow.csv.read_csv(
                csv_stream,
                # Apply the declared schema and the null-marker list; string
                # columns may contain nulls as well.
                convert_options=pyarrow.csv.ConvertOptions(
                    column_types=column_types[key],
                    strings_can_be_null=True,
                    null_values=null_values,
                ),
            )

# Cell output: table dimensions and the resulting Arrow schema.
prescriptions.shape, prescriptions.schema

## Store and reload as pandas

In [None]:
# Create the output directory first so that writing does not fail on a fresh
# checkout where `raw/` does not exist yet.
rawdata_path.mkdir(parents=True, exist_ok=True)
pyarrow.parquet.write_table(prescriptions, rawdata_path / f"{key}.parquet")
# Convert to pandas, mapping Arrow types to nullable pandas dtypes where
# `types_map` has an entry. `self_destruct=True` is requested, so the Arrow
# table is not reused afterwards: the name is rebound to the DataFrame.
prescriptions = prescriptions.to_pandas(self_destruct=True, types_mapper=types_map.get)
prescriptions

# Filter Dataset

## Only choose previously selected admission ids

In [None]:
# Admissions selected by the earlier admissions pre-processing step.
admissions = pd.read_parquet(dataset_path / "admissions_processed.parquet")

# Keep only rows whose admission id / patient id occurs in the selected
# admissions. A dedicated loop variable is used so that the notebook-global
# `key` (the table name used by the load/store cells) is not overwritten.
for id_column in ["hadm_id", "subject_id"]:
    mask = prescriptions[id_column].isin(admissions[id_column])
    prescriptions = prescriptions[mask]
    print(f"Removing {(~mask).sum()} {id_column}")
    print(f"Number of patients remaining: {prescriptions['subject_id'].nunique()}")
    print(f"Number of admissions remaining: {prescriptions['hadm_id'].nunique()}")
    print(f"Number of events remaining: {prescriptions.shape}")

## Only keep data with valid starttime

In [None]:
# Drop every row whose start time is missing.
mask = ~prescriptions["starttime"].isna()
prescriptions = prescriptions.loc[mask]

## Only keep data with float-valued measurements

In [None]:
# Parse the textual dose once; entries that are not a single number
# (e.g. intervals) become NaN and are filtered out below.
dose_values = pd.to_numeric(prescriptions["dose_val_rx"], errors="coerce")
mask = dose_values.notna()
# Copy the filtered rows so the column assignment below writes to an
# independent frame instead of a slice of the previous one.
prescriptions = prescriptions[mask].copy()
# Reuse the parsed numbers so that the filter and the conversion are
# guaranteed to agree on what counts as a valid float.
prescriptions["dose_val_rx"] = dose_values[mask].astype("float32")

## Select entries whose drug name is in the list from the paper.

In [None]:
# Drug names to keep; strings must match the `drug` column exactly
# (including the double space in the flush entry).
drugs_list = [
    "Acetaminophen",
    "Aspirin",
    "Bisacodyl",
    "Insulin",
    "Heparin",
    "Docusate Sodium",
    "D5W",
    "Humulin-R Insulin",
    "Potassium Chloride",
    "Magnesium Sulfate",
    "Metoprolol Tartrate",
    "Sodium Chloride 0.9%  Flush",
    "Pantoprazole",
]

# Restrict the table to the listed drugs and report what is left.
prescriptions = prescriptions.loc[prescriptions["drug"].isin(drugs_list)]
print(f"Number of events remaining: {prescriptions.shape}")
print(f"Number of patients remaining: {prescriptions['subject_id'].nunique()}")
# Cell output: how often each dose unit occurs per drug.
prescriptions.groupby("drug")["dose_unit_rx"].value_counts()

# Correct Units

In [None]:
# Discard rows that carry no dose unit, then report the remaining size.
has_unit = prescriptions["dose_unit_rx"].notna()
prescriptions = prescriptions.loc[has_unit]
print(f"Number of events remaining: {prescriptions.shape}")
print(f"Number of patients remaining: {prescriptions['subject_id'].nunique()}")

In [None]:
# Unify the spelling of millilitres. The result is assigned back explicitly:
# an in-place `replace` on a column selection is a chained assignment and does
# not update the frame when pandas copy-on-write is active. The frame is
# copied first so the assignment targets an independent object.
prescriptions = prescriptions.copy()
prescriptions["dose_unit_rx"] = prescriptions["dose_unit_rx"].replace("mL", "ml")

# Expected dose unit per drug; drugs that are not listed are not checked.
check_dose_unit_rx = {
    "Acetaminophen": "mg",
    "D5W": "ml",
    "Heparin": "UNIT",
    "Insulin": "UNIT",
    "Magnesium Sulfate": "gm",
    "Potassium Chloride": "mEq",
    "Bisacodyl": "mg",
    "Pantoprazole": "mg",
}

# Collect every row whose unit differs from the expected unit of its drug.
MASK = pd.Series(False, index=prescriptions.index)
for drug, unit in check_dose_unit_rx.items():
    mask = (prescriptions["drug"] == drug) & (prescriptions["dose_unit_rx"] != unit)
    # The message names the unit that is kept; rows with any other unit go.
    print(f"Removing {mask.sum():5d} entries - bad data in {drug} (unit is not {unit})")
    MASK |= mask

# Drop the flagged rows and show the remaining unit distribution per drug.
prescriptions = prescriptions[~MASK].copy()
prescriptions.groupby("drug")["dose_unit_rx"].value_counts()

# Serialize Pre-processed DataFrame

In [None]:
# Copy the start time into a `charttime` column.
prescriptions["charttime"] = prescriptions["starttime"]

# Append " (prescription)" to every drug name so that these labels cannot be
# confused with labels coming from other tables, then store the column as
# categorical again.
prescriptions["drug"] = (
    prescriptions["drug"].astype("string") + " (prescription)"
).astype("category")


# Clean categories
def clean_categories(df):
    """Drop unused categories from every categorical column of *df*.

    The frame is modified in place (each categorical column is reassigned)
    and the same object is returned for convenience. Columns of any other
    dtype are left untouched.
    """
    categorical_columns = [name for name in df.columns if df[name].dtype == "category"]
    for name in categorical_columns:
        df[name] = df[name].cat.remove_unused_categories()
    return df


# Remove categories that no longer occur after filtering, then persist the
# processed table next to the other processed files.
prescriptions = clean_categories(prescriptions)
prescriptions.to_parquet(dataset_path.joinpath("prescriptions_processed.parquet"))
# Cell output: final dimensions and column dtypes.
(prescriptions.shape, prescriptions.dtypes)