Connect to MIMIC IV database.

In [1]:
import toml

config = toml.load("../configs/db.toml")
user = config["user"]
password = config["password"]
host = config["host"]
dbname = config["dbname"]

In [2]:
from sqlalchemy import create_engine

engine = create_engine(f"postgresql://{user}:{password}@{host}/{dbname}")
connection = engine.connect()

Retrieve index date of patients in case group.

The patients with psychosis belong to case group, the others belong to control group.

The index date for case group is the first admit date of psychosis.

In [3]:
import sys
from pathlib import Path

sys.path.append(str(Path("..").resolve())) # enable to import utils.icd

In [4]:
from utils.icd import load_icd

psychosis_icd = load_icd("psychosis")

In [5]:
import pandas as pd

case = (
    pd.read_sql_query(
        f"""
        SELECT
            subject_id,
            MIN(admit_date) AS index_date
        FROM (
            SELECT
                subject_id,
                DATE(admittime) AS admit_date,
                icd_code,
                icd_version
            FROM mimic_hosp.diagnoses_icd
            NATURAL JOIN mimic_core.admissions
            UNION ALL
            SELECT
                subject_id,
                DATE(intime) AS admit_date,
                icd_code,
                icd_version
            FROM mimic_ed.diagnosis
            NATURAL JOIN mimic_ed.edstays
        ) AS all_diagnoses
        WHERE (
            icd_version = 10
            AND icd_code IN {psychosis_icd.v10}
        )
        OR (
            icd_version = 9
            AND icd_code IN {psychosis_icd.v9}
        )
        GROUP BY subject_id
        """,
        connection,
        index_col="subject_id",
        parse_dates=["index_date"],
    )
)
case

Unnamed: 0_level_0,index_date
subject_id,Unnamed: 1_level_1
10000117,2181-11-15
10000883,2124-05-14
10000935,2187-10-10
10001180,2197-06-06
10001884,2130-08-21
...,...
19998198,2128-09-06
19998444,2155-06-03
19999043,2164-12-18
19999466,2116-08-30


Retrieve index date of patients in control group.

The index date for control group is the first admit date.

In [6]:
control = (
    pd.read_sql_query(
        f"""
        SELECT
            subject_id,
            MIN(admit_date) AS index_date
        FROM (
            SELECT
                subject_id,
                DATE(admittime) AS admit_date
            FROM mimic_hosp.diagnoses_icd
            NATURAL JOIN mimic_core.admissions
            UNION ALL
            SELECT
                subject_id,
                DATE(intime) AS admit_date
            FROM mimic_ed.diagnosis
            NATURAL JOIN mimic_ed.edstays
        ) AS all_diagnoses
        GROUP BY subject_id
        """,
        connection,
        index_col="subject_id",
        parse_dates=["index_date"],
    )
    .query("~index.isin(@case.index)")
)
control

Unnamed: 0_level_0,index_date
subject_id,Unnamed: 1_level_1
10000019,2129-05-21
10000032,2180-05-06
10000044,2150-09-30
10000068,2160-03-03
10000074,2110-10-16
...,...
19999768,2139-03-25
19999784,2119-06-18
19999840,2164-07-25
19999914,2158-12-24


Concatenate case and control group as design matrix.

The design matrix is the main dataframe contains patients' information.

In [7]:
design_matrix = pd.concat([case, control], verify_integrity=True, copy=False)
design_matrix

Unnamed: 0_level_0,index_date
subject_id,Unnamed: 1_level_1
10000117,2181-11-15
10000883,2124-05-14
10000935,2187-10-10
10001180,2197-06-06
10001884,2130-08-21
...,...
19999768,2139-03-25
19999784,2119-06-18
19999840,2164-07-25
19999914,2158-12-24


Add predictor column to design matrix.

The predictor is a boolean represents patient in case or control group in the analysis.

In [8]:
design_matrix["with_psychosis"] = design_matrix.index.isin(case.index)
design_matrix.attrs["predictor_col"] = "with_psychosis"
design_matrix

Unnamed: 0_level_0,index_date,with_psychosis
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10000117,2181-11-15,True
10000883,2124-05-14,True
10000935,2187-10-10,True
10001180,2197-06-06,True
10001884,2130-08-21,True
...,...,...
19999768,2139-03-25,False
19999784,2119-06-18,False
19999840,2164-07-25,False
19999914,2158-12-24,False


Drop patients in case group whose index date is equal to the last date to avoid the duration will be 0.

In [10]:
print(f"before case group dropping: {len(design_matrix)}")
design_matrix.drop(
    index=(
        pd.read_sql_query(
            f"""
            SELECT
                subject_id,
                MAX(admit_date) AS last_date
            FROM (
                SELECT
                    subject_id,
                    DATE(admittime) AS admit_date
                FROM mimic_hosp.diagnoses_icd
                NATURAL JOIN mimic_core.admissions
                UNION ALL
                SELECT
                    subject_id,
                    DATE(intime) AS admit_date
                FROM mimic_ed.diagnosis
                NATURAL JOIN mimic_ed.edstays
            ) AS all_diagnoses
            GROUP BY subject_id
            """,
            connection,
            index_col="subject_id",
            parse_dates=["last_date"],
        )
        .loc[case.index] # apply case only
        .query("last_date == @case['index_date']")
        .index
    ),
    inplace=True,
)
print(f"after case group dropping: {len(design_matrix)}")

before case group dropping: 342251
after case group dropping: 325561


Drop patients whose index date is equal to event date to avoid the duration will be 0.

In [11]:
ischemic_stroke_icd = load_icd("ischemic_stroke")

In [12]:
print(f"before dropping: {len(design_matrix)}")
design_matrix.drop(
    index=(
        pd.read_sql_query(
            f"""
            SELECT
                subject_id,
                MIN(admit_date) AS first_date_ischemic_stroke
            FROM (
                SELECT
                    subject_id,
                    DATE(admittime) AS admit_date,
                    icd_code,
                    icd_version
                FROM mimic_hosp.diagnoses_icd
                NATURAL JOIN mimic_core.admissions
                UNION ALL
                SELECT
                    subject_id,
                    DATE(intime) AS admit_date,
                    icd_code,
                    icd_version
                FROM mimic_ed.diagnosis
                NATURAL JOIN mimic_ed.edstays
            ) AS all_diagnoses
            WHERE (
                icd_version = 10
                AND icd_code IN {ischemic_stroke_icd.v10}
            )
            OR (
                icd_version = 9
                AND icd_code IN {ischemic_stroke_icd.v9}
            )
            GROUP BY subject_id
            """,
            connection,
            index_col="subject_id",
            parse_dates=["first_date_ischemic_stroke"],
        )
        .join(design_matrix, how="right")
        .query("first_date_ischemic_stroke == index_date")
        .index
    ),
    inplace=True,
)
print(f"after dropping: {len(design_matrix)}")

before dropping: 325561
after dropping: 319402


Add event date column to design matrix.

The event date is the first admit date of ischemic stroke after index date.

Not every patient had been diagnosed psychosis after index date, so left it NaT for now.

In [13]:
def first_date_later_than_index_date(df: pd.DataFrame) -> pd.Timestamp:
    subject_id = df.name
    index_date = design_matrix.loc[subject_id, "index_date"]
    later_than_index_date_mask = df["admit_date"].gt(index_date)
    if not later_than_index_date_mask.any():
        return pd.NaT
    first_true_arg = later_than_index_date_mask.argmax()
    return df["admit_date"].iloc[first_true_arg]

In [14]:
design_matrix = (
    pd.read_sql_query(
        f"""
        SELECT
            subject_id,
            admit_date
        FROM (
            SELECT
                subject_id,
                DATE(admittime) AS admit_date,
                icd_code,
                icd_version
            FROM mimic_hosp.diagnoses_icd
            NATURAL JOIN mimic_core.admissions
            UNION ALL
            SELECT
                subject_id,
                DATE(intime) AS admit_date,
                icd_code,
                icd_version
            FROM mimic_ed.diagnosis
            NATURAL JOIN mimic_ed.edstays
        ) AS all_diagnoses
        WHERE (
            icd_version = 10
            AND icd_code IN {ischemic_stroke_icd.v10}
        )
        OR (
            icd_version = 9
            AND icd_code IN {ischemic_stroke_icd.v9}
        )
        ORDER BY admit_date
        """,
        connection,
        index_col="subject_id",
        parse_dates=["admit_date"],
    )
    .query("index.isin(@design_matrix.index)")
    .groupby("subject_id", sort=False)
    .apply(first_date_later_than_index_date)
    .rename("event_date")
    .to_frame() # series has no join method
    .join(design_matrix, how="right")
)
design_matrix

Unnamed: 0_level_0,event_date,index_date,with_psychosis
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10000117,NaT,2181-11-15,True
10001884,NaT,2130-08-21,True
10002013,NaT,2164-03-19,True
10002221,NaT,2203-06-13,True
10002428,NaT,2160-04-14,True
...,...,...,...
19999750,NaT,2144-03-22,False
19999768,NaT,2139-03-25,False
19999784,NaT,2119-06-18,False
19999914,NaT,2158-12-24,False


Add event column to design matrix.

The event is a boolean represents whether psychosis is diagnosed after index date.

In [15]:
design_matrix["E"] = design_matrix["event_date"].notna()
design_matrix

Unnamed: 0_level_0,event_date,index_date,with_psychosis,E
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10000117,NaT,2181-11-15,True,False
10001884,NaT,2130-08-21,True,False
10002013,NaT,2164-03-19,True,False
10002221,NaT,2203-06-13,True,False
10002428,NaT,2160-04-14,True,False
...,...,...,...,...
19999750,NaT,2144-03-22,False,False
19999768,NaT,2139-03-25,False,False
19999784,NaT,2119-06-18,False,False
19999914,NaT,2158-12-24,False,False


For patient who didn't find event, use death date as event date instead.

In [16]:
design_matrix["event_date"].mask(
    design_matrix["event_date"].isna(),
    other=(
        pd.read_sql_query(
            """
            SELECT
                subject_id,
                DATE(dod) AS death_date
            FROM mimic_core.patients
            """,
            connection,
            index_col="subject_id",
            parse_dates=["death_date"]
        )
        .loc[design_matrix.index]
        .squeeze() # dataframe to series
    ),
    inplace=True,
)
design_matrix["event_date"].isna().value_counts()

True     307573
False     11829
Name: event_date, dtype: int64

For patient who have no death date, use 12/31/last_year as event date instead.

In [17]:
design_matrix["event_date"].mask(
    design_matrix["event_date"].isna(),
    other=(
        pd.read_sql_query(
            """
            SELECT
                subject_id,
                MAKE_DATE(CAST(MAX(admit_year) AS INTEGER), 12, 31) AS event_date
            FROM (
                SELECT
                    subject_id,
                    EXTRACT(YEAR FROM admittime) AS admit_year
                FROM mimic_hosp.diagnoses_icd
                NATURAL JOIN mimic_core.admissions
                UNION ALL
                SELECT
                    subject_id,
                    EXTRACT(YEAR FROM intime) AS admit_year
                FROM mimic_ed.diagnosis
                NATURAL JOIN mimic_ed.edstays
            ) AS all_diagnoses
            GROUP BY subject_id
            """,
            connection,
            index_col="subject_id",
            parse_dates=["event_date"],
        )
        .loc[design_matrix.index]
        .squeeze() # dataframe to series
    ),
    inplace=True,
)
assert design_matrix["event_date"].notna().all()

Add gender, age columns to design matrix.

In [18]:
design_matrix = (
    pd.read_sql_query(
        """
        SELECT
            subject_id,
            (CASE gender WHEN 'M' THEN 1 ELSE 0 END) AS gender,
            (anchor_year - anchor_age) AS birth_year
        FROM mimic_core.patients
        """,
        connection,
        index_col="subject_id",
    )
    .join(design_matrix, how="right")
    .assign(age=lambda df: df["index_date"].dt.year - df["birth_year"])
    .drop(columns="birth_year")
)
design_matrix

Unnamed: 0_level_0,gender,event_date,index_date,with_psychosis,E,age
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10000117,0,2183-12-31,2181-11-15,True,False,55
10001884,0,2131-01-20,2130-08-21,True,False,76
10002013,0,2167-12-31,2164-03-19,True,False,61
10002221,0,2204-12-31,2203-06-13,True,False,71
10002428,0,2160-12-31,2160-04-14,True,False,85
...,...,...,...,...,...,...
19999750,1,2144-12-31,2144-03-22,False,False,45
19999768,1,2139-12-31,2139-03-25,False,False,0
19999784,1,2121-12-31,2119-06-18,False,False,57
19999914,0,2158-12-31,2158-12-24,False,False,49


Drop patients in control group whose gender and age are not matchable with patients in case group.

In [19]:
case = design_matrix.query("with_psychosis")
control = design_matrix.query("~with_psychosis")

In [20]:
gender_age_case = case[["gender", "age"]]

def matchable(row: pd.Series) -> bool:
    return row.eq(gender_age_case).all(axis=1).any()

In [22]:
from tqdm import tqdm

tqdm.pandas(desc="Matching")

In [23]:
matchable_mask = control[["gender", "age"]].progress_apply(matchable, axis=1)
matchable_mask.value_counts().rename("matchable")

Matching: 100%|██████████| 301393/301393 [01:58<00:00, 2541.61it/s]


True     240555
False     60838
Name: matchable, dtype: int64

In [24]:
print(f"before control group dropping: {len(design_matrix)}")
design_matrix.drop(
    index=control[~matchable_mask].index,
    inplace=True
)
print(f"after control group dropping.: {len(design_matrix)}")

before control group dropping: 319402
after control group dropping.: 258564


Add duration column to design matrix.

The duration represents number of days between index date and event date.

In [25]:
design_matrix["T"] = design_matrix["event_date"].sub(design_matrix["index_date"]).dt.days
design_matrix

Unnamed: 0_level_0,gender,event_date,index_date,with_psychosis,E,age,T
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10000117,0,2183-12-31,2181-11-15,True,False,55,776
10001884,0,2131-01-20,2130-08-21,True,False,76,152
10002013,0,2167-12-31,2164-03-19,True,False,61,1382
10002221,0,2204-12-31,2203-06-13,True,False,71,567
10002428,0,2160-12-31,2160-04-14,True,False,85,261
...,...,...,...,...,...,...,...
19999733,0,2152-12-31,2152-07-08,False,False,19,176
19999750,1,2144-12-31,2144-03-22,False,False,45,284
19999784,1,2121-12-31,2119-06-18,False,False,57,927
19999914,0,2158-12-31,2158-12-24,False,False,49,7


Drop the patients who durations are negative (exception).

In [26]:
neg_T = design_matrix.query("T < 0")
neg_T

Unnamed: 0_level_0,gender,event_date,index_date,with_psychosis,E,age,T
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
15943273,0,2143-12-28,2147-09-10,False,False,37,-1352


In [27]:
design_matrix.drop(
    index=neg_T.index,
    inplace=True
)
assert design_matrix["T"].ge(0).all()

Add covariate columns to design matrix.

Covariate columns:
* with_hypertension
* with_heart_type_disease
* with_neurological_type_disease
* with_diabetes
* with_hyperlipidemia
* hypertension_times
* heart_type_disease_times
* neurological_type_disease_times
* diabetes_times
* hyperlipidemia_times

In [28]:
covariates = [
    "hypertension",
    "heart_type_disease",
    "neurological_type_disease",
    "diabetes",
    "hyperlipidemia",
]

In [29]:
for covariate in covariates:
    covariate_icd = load_icd(covariate)
    design_matrix = (
        pd.read_sql_query(
            f"""
            SELECT
                subject_id,
                COUNT(*) AS {covariate}_times
            FROM (
                SELECT
                    subject_id,
                    icd_code,
                    icd_version
                FROM mimic_hosp.diagnoses_icd
                UNION ALL
                SELECT
                    subject_id,
                    icd_code,
                    icd_version
                FROM mimic_ed.diagnosis
            ) AS all_diagnoses
            WHERE (
                icd_version = 10
                AND icd_code IN {str(covariate_icd.v10).replace(",)", ")")}
            )
            OR (
                icd_version = 9
                AND icd_code IN {str(covariate_icd.v9).replace(",)", ")")}
            )
            GROUP BY subject_id
            """,
            connection,
            index_col="subject_id",
        )
        .join(design_matrix, how="right")
        .fillna({f"{covariate}_times": 0})
        .astype({f"{covariate}_times": "int32"}, copy=False)
    )
    design_matrix[f"with_{covariate}"] = design_matrix[f"{covariate}_times"] > 0
design_matrix

Unnamed: 0_level_0,hyperlipidemia_times,diabetes_times,neurological_type_disease_times,heart_type_disease_times,hypertension_times,gender,event_date,index_date,with_psychosis,E,age,T,with_hypertension,with_heart_type_disease,with_neurological_type_disease,with_diabetes,with_hyperlipidemia
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
10000117,1,0,1,2,0,0,2183-12-31,2181-11-15,True,False,55,776,False,True,True,False,True
10001884,16,1,0,42,22,0,2131-01-20,2130-08-21,True,False,76,152,True,True,False,True,True
10002013,6,31,2,33,14,0,2167-12-31,2164-03-19,True,False,61,1382,True,True,True,True,True
10002221,5,4,0,0,0,0,2204-12-31,2203-06-13,True,False,71,567,False,False,False,True,True
10002428,4,0,1,8,9,0,2160-12-31,2160-04-14,True,False,85,261,True,True,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19999733,0,0,1,0,0,0,2152-12-31,2152-07-08,False,False,19,176,False,False,True,False,False
19999750,0,0,0,0,0,1,2144-12-31,2144-03-22,False,False,45,284,False,False,False,False,False
19999784,0,0,7,0,6,1,2121-12-31,2119-06-18,False,False,57,927,True,False,True,False,False
19999914,0,0,0,0,0,0,2158-12-31,2158-12-24,False,False,49,7,False,False,False,False,False


In [30]:
design_matrix.attrs["with_covariate_cols"] = [f"with_{c}" for c in covariates]
design_matrix.attrs["covariate_times_cols"] = [f"{c}_times" for c in covariates]

Sort columns of design matrix.

In [33]:
cols = [
    "gender",
    "age",
    design_matrix.attrs["predictor_col"],
    "index_date",
    "event_date",
    "T",
    "E",
    *design_matrix.attrs["with_covariate_cols"],
    *design_matrix.attrs["covariate_times_cols"],
]
design_matrix = design_matrix[cols]
design_matrix

Unnamed: 0_level_0,gender,age,with_psychosis,index_date,event_date,T,E,with_hypertension,with_heart_type_disease,with_neurological_type_disease,with_diabetes,with_hyperlipidemia,hypertension_times,heart_type_disease_times,neurological_type_disease_times,diabetes_times,hyperlipidemia_times
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
10000117,0,55,True,2181-11-15,2183-12-31,776,False,False,True,True,False,True,0,2,1,0,1
10001884,0,76,True,2130-08-21,2131-01-20,152,False,True,True,False,True,True,22,42,0,1,16
10002013,0,61,True,2164-03-19,2167-12-31,1382,False,True,True,True,True,True,14,33,2,31,6
10002221,0,71,True,2203-06-13,2204-12-31,567,False,False,False,False,True,True,0,0,0,4,5
10002428,0,85,True,2160-04-14,2160-12-31,261,False,True,True,True,False,True,9,8,1,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19999733,0,19,False,2152-07-08,2152-12-31,176,False,False,False,True,False,False,0,0,1,0,0
19999750,1,45,False,2144-03-22,2144-12-31,284,False,False,False,False,False,False,0,0,0,0,0
19999784,1,57,False,2119-06-18,2121-12-31,927,False,True,False,True,False,False,6,0,7,0,0
19999914,0,49,False,2158-12-24,2158-12-31,7,False,False,False,False,False,False,0,0,0,0,0


Save design matrix.

In [34]:
design_matrix.to_pickle("../data/design_matrix_psychosis_ischemic_stroke.pkl")