Connect to MIMIC IV database.

In [1]:
import toml

# Read the database connection settings from the project's TOML config file.
config = toml.load("../configs/db.toml")
user, password, host, dbname = (
    config[key] for key in ("user", "password", "host", "dbname")
)

In [2]:
from urllib.parse import quote

from sqlalchemy import create_engine

# Percent-encode the credentials before embedding them in the URL: characters
# such as "@", ":" or "/" in a user name or password would otherwise be parsed
# as URL delimiters and corrupt the connection string.
# NOTE(review): if db.toml already stores percent-encoded values, drop the quoting.
engine = create_engine(
    "postgresql://"
    f"{quote(str(user), safe='')}:{quote(str(password), safe='')}"
    f"@{host}/{dbname}"
)
connection = engine.connect()

Retrieve gender and age of patients with psychosis as design matrix.

The design matrix is the main dataframe that contains the patients' information.

In [3]:
import sys
from pathlib import Path

# Make the parent directory importable so that `utils.icd` can be imported.
# The membership check keeps sys.path free of duplicates when the cell is re-run.
project_root = str(Path("..").resolve())
if project_root not in sys.path:
    sys.path.append(project_root)

In [4]:
from utils.icd import load_icd

# ICD codes for psychosis; the `.v9` and `.v10` attributes of the result are
# interpolated into the SQL `IN` clauses of the queries below.
psychosis_icd = load_icd("psychosis")

In [5]:
import pandas as pd

# Base design matrix: one row per patient with a psychosis diagnosis (hospital
# or ED records). Gender is encoded as 1 for 'M' and 0 otherwise; age is taken
# at the year of the first psychosis admission.
design_matrix = (
    pd.read_sql_query(
        f"""
        SELECT
            subject_id,
            (CASE gender WHEN 'M' THEN 1 ELSE 0 END) AS gender,
            (CAST(first_psychosis_admit_year - (anchor_year - anchor_age) AS INTEGER)) AS age
        FROM (
            SELECT
                subject_id,
                MIN(admit_year) AS first_psychosis_admit_year
            FROM (
                SELECT
                    subject_id,
                    icd_code,
                    icd_version,
                    EXTRACT(YEAR FROM admittime) AS admit_year
                FROM mimic_hosp.diagnoses_icd
                NATURAL JOIN mimic_core.admissions
                UNION ALL
                SELECT
                    subject_id,
                    icd_code,
                    icd_version,
                    EXTRACT(YEAR FROM intime) AS admit_year
                FROM mimic_ed.diagnosis
                NATURAL JOIN mimic_ed.edstays
            ) AS all_diagnoses
            WHERE (
                icd_version = 10
                AND icd_code IN {psychosis_icd.v10}
            )
            OR (
                icd_version = 9
                AND icd_code IN {psychosis_icd.v9}
            )
            GROUP BY subject_id
        ) AS all_first_psychosis_admit_year
        NATURAL JOIN mimic_core.patients
        """,
        connection,
        index_col="subject_id",
        # The query selects no date column, so no `parse_dates` argument is passed
        # (the previous `parse_dates=["death_date"]` named a column that is not selected).
    )
    .astype({"age": "int"})
)
design_matrix

Unnamed: 0_level_0,gender,age
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10074117,0,55
10777705,0,65
10828368,1,50
11105181,1,37
11110307,0,39
...,...,...
19988595,1,75
19995244,0,80
19996832,0,19
19997448,0,52


Add index date column to design matrix.

For the case group, the index date is the date of the first admission with a digestive-disorder diagnosis.

The patients with digestive disorders belong to case group, the others belong to control group.

Patients in the control group have no digestive-disorder diagnosis, so their index date is left as NaT for now.

In [6]:
# ICD codes for digestive disorders; `.v9` / `.v10` are interpolated into the
# SQL `IN` clauses of the next query.
digestive_disorders_icd = load_icd("digestive_disorders")

In [7]:
# Earliest admission date (hospital or ED) carrying a digestive-disorder code,
# one row per patient.
digestive_index_dates = pd.read_sql_query(
    f"""
        SELECT
            subject_id,
            MIN(admit_date) AS index_date
        FROM (
            SELECT
                subject_id,
                DATE(admittime) AS admit_date,
                icd_code,
                icd_version
            FROM mimic_hosp.diagnoses_icd
            NATURAL JOIN mimic_core.admissions
            UNION ALL
            SELECT
                subject_id,
                DATE(intime) AS admit_date,
                icd_code,
                icd_version
            FROM mimic_ed.diagnosis
            NATURAL JOIN mimic_ed.edstays
        ) AS all_diagnoses
        WHERE (
            icd_version = 10
            AND icd_code IN {digestive_disorders_icd.v10}
        )
        OR (
            icd_version = 9
            AND icd_code IN {digestive_disorders_icd.v9}
        )
        GROUP BY subject_id
        """,
    connection,
    index_col="subject_id",
    parse_dates=["index_date"],
)
# The right join keeps every patient already in the design matrix; patients
# without a digestive-disorder record get NaT as index_date.
design_matrix = digestive_index_dates.join(design_matrix, how="right")
design_matrix

Unnamed: 0_level_0,index_date,gender,age
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10074117,2118-03-13,0,55
10777705,2150-08-12,0,65
10828368,NaT,1,50
11105181,2174-08-23,1,37
11110307,NaT,0,39
...,...,...,...
19988595,2129-11-27,1,75
19995244,2111-01-11,0,80
19996832,NaT,0,19
19997448,NaT,0,52


Add predictor column to design matrix.

The predictor is a boolean that indicates whether a patient belongs to the case group (True) or the control group (False) in the analysis.

In [8]:
# A patient is in the case group exactly when a digestive-disorder index date
# was found by the previous query.
predictor_col = "with_digestive_disorders"
design_matrix[predictor_col] = ~design_matrix["index_date"].isna()
design_matrix.attrs["predictor_col"] = predictor_col
design_matrix

Unnamed: 0_level_0,index_date,gender,age,with_digestive_disorders
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10074117,2118-03-13,0,55,True
10777705,2150-08-12,0,65,True
10828368,NaT,1,50,False
11105181,2174-08-23,1,37,True
11110307,NaT,0,39,False
...,...,...,...,...
19988595,2129-11-27,1,75,True
19995244,2111-01-11,0,80,True
19996832,NaT,0,19,False
19997448,NaT,0,52,False


Group patients into case and control groups.

In [9]:
# Case group: rows whose predictor flag is True.
case = design_matrix.loc[design_matrix["with_digestive_disorders"]]
case

Unnamed: 0_level_0,index_date,gender,age,with_digestive_disorders
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10074117,2118-03-13,0,55,True
10777705,2150-08-12,0,65,True
11105181,2174-08-23,1,37,True
11339657,2125-07-03,1,91,True
11359914,2131-10-12,0,63,True
...,...,...,...,...
19971094,2187-03-10,1,91,True
19977062,2135-01-19,0,62,True
19988595,2129-11-27,1,75,True
19995244,2111-01-11,0,80,True


In [10]:
# Control group: rows whose predictor flag is False.
control = design_matrix.loc[~design_matrix["with_digestive_disorders"]]
control

Unnamed: 0_level_0,index_date,gender,age,with_digestive_disorders
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10828368,NaT,1,50,False
11110307,NaT,0,39,False
11131399,NaT,1,64,False
11679057,NaT,0,21,False
11799380,NaT,0,43,False
...,...,...,...,...
19959949,NaT,1,39,False
19962526,NaT,0,79,False
19978886,NaT,1,53,False
19996832,NaT,0,19,False


For control group, the index date is the first admit date.

In [11]:
# Earliest admission date (hospital or ED) of every patient; used as the index
# date for the control group, whose index_date is still NaT at this point.
first_admit_dates = pd.read_sql_query(
    """
        SELECT
            subject_id,
            MIN(admit_date) AS index_date
        FROM (
            SELECT
                subject_id,
                DATE(admittime) AS admit_date
            FROM mimic_hosp.diagnoses_icd
            NATURAL JOIN mimic_core.admissions
            UNION ALL
            SELECT
                subject_id,
                DATE(intime) AS admit_date
            FROM mimic_ed.diagnosis
            NATURAL JOIN mimic_ed.edstays
        ) AS all_diagnoses
        GROUP BY subject_id
        """,
    connection,
    index_col="subject_id",
    parse_dates=["index_date"],
)["index_date"]  # select the Series so the assignment aligns on subject_id only
design_matrix.loc[control.index, "index_date"] = first_admit_dates.loc[control.index]
# `.all()` on a datetime column does not test for missing values (and is
# deprecated for datetime64 in recent pandas); check for NaT explicitly.
assert design_matrix["index_date"].notna().all()
design_matrix

Unnamed: 0_level_0,index_date,gender,age,with_digestive_disorders
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10074117,2118-03-13,0,55,True
10777705,2150-08-12,0,65,True
10828368,2184-08-20,1,50,False
11105181,2174-08-23,1,37,True
11110307,2156-05-08,0,39,False
...,...,...,...,...
19988595,2129-11-27,1,75,True
19995244,2111-01-11,0,80,True
19996832,2179-02-21,0,19,False
19997448,2121-10-03,0,52,False


Drop patients in control group whose gender and age are not matchable with patients in case group.

In [12]:
# (gender, age) of every case patient; control rows are compared against these.
gender_age_case = case.loc[:, ["gender", "age"]]


def matchable(row: pd.Series) -> bool:
    """Return whether ``row`` equals at least one row of ``gender_age_case``.

    ``row`` is expected to carry the same labels ("gender", "age") as the
    columns of ``gender_age_case``.
    """
    rowwise_equal = (gender_age_case == row).all(axis=1)
    return rowwise_equal.any()

In [13]:
from tqdm import tqdm

# Register tqdm's pandas integration; `progress_apply` is used in the matching
# step below, and "Matching" is the label shown on the progress bar.
tqdm.pandas(desc="Matching")

In [14]:
# Evaluate `matchable` for every control patient, with a progress bar.
control_gender_age = control[["gender", "age"]]
matchable_mask = control_gender_age.progress_apply(matchable, axis=1)
matchable_mask.value_counts().rename("matchable")

Matching: 100%|██████████| 15093/15093 [00:05<00:00, 2596.93it/s]


True     15088
False        5
Name: matchable, dtype: int64

In [15]:
# Remove the control patients for which no case patient shares gender and age.
print(f"before control dropping: {len(design_matrix)}")
unmatched_control_ids = control.loc[~matchable_mask].index
design_matrix = design_matrix.drop(index=unmatched_control_ids)
print(f"after control dropping: {len(design_matrix)}")

before control dropping: 35053
after control dropping: 35048


Add event date column to design matrix.

The event date is the first admit date of psychosis after index date.

Not every patient was diagnosed with psychosis after the index date; for those patients the event date is left as NaT for now.

In [16]:
def first_date_later_than_index_date(
    df: pd.DataFrame, index_dates: "pd.Series | None" = None
) -> pd.Timestamp:
    """Return the earliest ``admit_date`` in ``df`` strictly after the index date.

    Intended for ``groupby(...).apply``: ``df`` holds the rows of one patient
    and ``df.name`` is that patient's group key.

    Parameters
    ----------
    df
        Rows of a single patient with an ``admit_date`` column.
    index_dates
        Optional Series of index dates keyed like ``df.name``. When omitted,
        the ``index_date`` column of the global ``design_matrix`` is used.

    Returns
    -------
    The earliest qualifying date, or ``pd.NaT`` when no ``admit_date`` is
    later than the index date.
    """
    if index_dates is None:
        index_dates = design_matrix["index_date"]
    index_date = index_dates.loc[df.name]
    later_dates = df.loc[df["admit_date"] > index_date, "admit_date"]
    if later_dates.empty:
        return pd.NaT
    # The rows are not guaranteed to be in chronological order, so take the
    # minimum rather than the first positional match.
    return later_dates.min()

In [17]:
# Every psychosis admission date (hospital and ED), one row per diagnosis record.
psychosis_admit_dates = pd.read_sql_query(
    f"""
        SELECT
            subject_id,
            admit_date
        FROM (
            SELECT
                subject_id,
                icd_code,
                icd_version,
                DATE(admittime) AS admit_date
            FROM mimic_hosp.diagnoses_icd
            NATURAL JOIN mimic_core.admissions
            UNION ALL
            SELECT
                subject_id,
                icd_code,
                icd_version,
                DATE(intime) AS admit_date
            FROM mimic_ed.diagnosis
            NATURAL JOIN mimic_ed.edstays
        ) AS all_diagnoses
        WHERE (
            icd_version = 10
            AND icd_code IN {psychosis_icd.v10}
        )
        OR (
            icd_version = 9
            AND icd_code IN {psychosis_icd.v9}
        )
        """,
    connection,
    index_col="subject_id",
    parse_dates=["admit_date"],
)
# prepare event date column: restrict to patients in the design matrix, then
# reduce each patient's admissions to a single date (NaT when none qualifies)
event_dates = (
    psychosis_admit_dates.loc[design_matrix.index]
    .groupby("subject_id", sort=False)
    .apply(first_date_later_than_index_date)
    .rename("event_date")
    .to_frame()  # series has no join method
)
# join event date column to design matrix
design_matrix = event_dates.join(design_matrix, how="right")
design_matrix

Unnamed: 0_level_0,event_date,index_date,gender,age,with_digestive_disorders
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10074117,NaT,2118-03-13,0,55,True
10777705,2150-08-13,2150-08-12,0,65,True
10828368,2185-02-07,2184-08-20,1,50,False
11105181,NaT,2174-08-23,1,37,True
11110307,NaT,2156-05-08,0,39,False
...,...,...,...,...,...
19988595,2137-01-25,2129-11-27,1,75,True
19995244,NaT,2111-01-11,0,80,True
19996832,NaT,2179-02-21,0,19,False
19997448,2122-07-22,2121-10-03,0,52,False


Add event column to design matrix.

The event is a boolean that represents whether psychosis was diagnosed after the index date.

In [18]:
# E is True when a psychosis admission after the index date was found,
# i.e. when event_date is not NaT at this point.
has_event = ~design_matrix["event_date"].isna()
design_matrix["E"] = has_event
design_matrix

Unnamed: 0_level_0,event_date,index_date,gender,age,with_digestive_disorders,E
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10074117,NaT,2118-03-13,0,55,True,False
10777705,2150-08-13,2150-08-12,0,65,True,True
10828368,2185-02-07,2184-08-20,1,50,False,True
11105181,NaT,2174-08-23,1,37,True,False
11110307,NaT,2156-05-08,0,39,False,False
...,...,...,...,...,...,...
19988595,2137-01-25,2129-11-27,1,75,True,True
19995244,NaT,2111-01-11,0,80,True,False
19996832,NaT,2179-02-21,0,19,False,False
19997448,2122-07-22,2121-10-03,0,52,False,True


For patients without an event (no psychosis admission after the index date), use the death date as the event date instead.

In [19]:
# Death date of every patient in the design matrix (NaT when none is recorded).
death_dates = pd.read_sql_query(
    """
            SELECT
                subject_id,
                DATE(dod) AS death_date
            FROM mimic_core.patients
            """,
    connection,
    index_col="subject_id",
    parse_dates=["death_date"],
).loc[design_matrix.index, "death_date"]
# Fill missing event dates with the death date. The result is assigned back
# instead of calling `mask(..., inplace=True)` on the selected column: that
# chained in-place form does not update the frame under pandas Copy-on-Write.
design_matrix["event_date"] = design_matrix["event_date"].mask(
    design_matrix["event_date"].isna(),
    other=death_dates,
)
design_matrix["event_date"].isna().value_counts()

False    20914
True     14134
Name: event_date, dtype: int64

For patients who have no death date, use December 31 of the year of their last admission as the event date instead.

In [20]:
# December 31 of the year of each patient's last admission (hospital or ED).
last_year_end_dates = pd.read_sql_query(
    """
            SELECT
                subject_id,
                MAKE_DATE(CAST(MAX(admit_year) AS INTEGER), 12, 31) AS event_date
            FROM (
                SELECT
                    subject_id,
                    EXTRACT(YEAR FROM admittime) AS admit_year
                FROM mimic_hosp.diagnoses_icd
                NATURAL JOIN mimic_core.admissions
                UNION ALL
                SELECT
                    subject_id,
                    EXTRACT(YEAR FROM intime) AS admit_year
                FROM mimic_ed.diagnosis
                NATURAL JOIN mimic_ed.edstays
            ) AS all_diagnoses
            GROUP BY subject_id
            """,
    connection,
    index_col="subject_id",
    parse_dates=["event_date"],
).loc[design_matrix.index, "event_date"]
# Fill the event dates that are still missing. The result is assigned back
# instead of calling `mask(..., inplace=True)` on the selected column: that
# chained in-place form does not update the frame under pandas Copy-on-Write.
design_matrix["event_date"] = design_matrix["event_date"].mask(
    design_matrix["event_date"].isna(),
    other=last_year_end_dates,
)
assert design_matrix["event_date"].notna().all()

Add duration column to design matrix.

The duration is the number of days between the index date and the event date.

In [21]:
# T: whole days elapsed from index date to event date; must never be negative.
follow_up = design_matrix["event_date"] - design_matrix["index_date"]
design_matrix["T"] = follow_up.dt.days
assert (design_matrix["T"] >= 0).all()
design_matrix

Unnamed: 0_level_0,event_date,index_date,gender,age,with_digestive_disorders,E,T
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10074117,2118-12-31,2118-03-13,0,55,True,False,293
10777705,2150-08-13,2150-08-12,0,65,True,True,1
10828368,2185-02-07,2184-08-20,1,50,False,True,171
11105181,2174-12-31,2174-08-23,1,37,True,False,130
11110307,2156-12-31,2156-05-08,0,39,False,False,237
...,...,...,...,...,...,...,...
19988595,2137-01-25,2129-11-27,1,75,True,True,2616
19995244,2111-12-31,2111-01-11,0,80,True,False,354
19996832,2179-12-31,2179-02-21,0,19,False,False,313
19997448,2122-07-22,2121-10-03,0,52,False,True,292


Add covariate columns to design matrix.

Covariate columns:
* with_hypertension
* with_heart_type_disease
* with_neurological_type_disease
* with_diabetes
* with_hyperlipidemia
* hypertension_times
* heart_type_disease_times
* neurological_type_disease_times
* diabetes_times
* hyperlipidemia_times

In [22]:
# Covariate names. Each name is passed to `load_icd` and is also used to build
# the `<name>_times` and `with_<name>` column names in the cells below.
covariates = [
    "hypertension",
    "heart_type_disease",
    "neurological_type_disease",
    "diabetes",
    "hyperlipidemia",
]

In [23]:
for covariate in covariates:
    covariate_icd = load_icd(covariate)
    times_col = f"{covariate}_times"
    # Render the code collections for the SQL IN clause; the replace strips the
    # trailing comma that a one-element tuple would print with
    # (presumably `.v9` / `.v10` are tuples; verify against utils.icd).
    v10_codes = str(covariate_icd.v10).replace(",)", ")")
    v9_codes = str(covariate_icd.v9).replace(",)", ")")
    # Number of diagnosis records (hospital and ED) per patient for this covariate.
    covariate_counts = pd.read_sql_query(
        f"""
            SELECT
                subject_id,
                COUNT(*) AS {covariate}_times
            FROM (
                SELECT
                    subject_id,
                    icd_code,
                    icd_version
                FROM mimic_hosp.diagnoses_icd
                UNION ALL
                SELECT
                    subject_id,
                    icd_code,
                    icd_version
                FROM mimic_ed.diagnosis
            ) AS all_diagnoses
            WHERE (
                icd_version = 10
                AND icd_code IN {v10_codes}
            )
            OR (
                icd_version = 9
                AND icd_code IN {v9_codes}
            )
            GROUP BY subject_id
            """,
        connection,
        index_col="subject_id",
    )
    # Patients without any matching record get a count of 0.
    design_matrix = (
        covariate_counts.join(design_matrix, how="right")
        .fillna({times_col: 0})
        .astype({times_col: "int32"}, copy=False)
    )
    design_matrix[f"with_{covariate}"] = design_matrix[times_col] > 0
design_matrix

Unnamed: 0_level_0,hyperlipidemia_times,diabetes_times,neurological_type_disease_times,heart_type_disease_times,hypertension_times,event_date,index_date,gender,age,with_digestive_disorders,E,T,with_hypertension,with_heart_type_disease,with_neurological_type_disease,with_diabetes,with_hyperlipidemia
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
10074117,0,0,0,12,3,2118-12-31,2118-03-13,0,55,True,False,293,True,True,False,False,False
10777705,1,0,0,0,0,2150-08-13,2150-08-12,0,65,True,True,1,False,False,False,False,True
10828368,0,1,0,0,0,2185-02-07,2184-08-20,1,50,False,True,171,False,False,False,True,False
11105181,0,0,0,0,0,2174-12-31,2174-08-23,1,37,True,False,130,False,False,False,False,False
11110307,0,0,0,0,0,2156-12-31,2156-05-08,0,39,False,False,237,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19988595,0,0,2,0,2,2137-01-25,2129-11-27,1,75,True,True,2616,True,False,True,False,False
19995244,1,0,3,0,0,2111-12-31,2111-01-11,0,80,True,False,354,False,False,True,False,True
19996832,0,0,0,0,0,2179-12-31,2179-02-21,0,19,False,False,313,False,False,False,False,False
19997448,1,0,0,5,0,2122-07-22,2121-10-03,0,52,False,True,292,False,True,False,False,True


In [24]:
# Record the names of the covariate columns as metadata on the frame.
design_matrix.attrs.update(
    {
        "with_covariate_cols": [f"with_{c}" for c in covariates],
        "covariate_times_cols": [f"{c}_times" for c in covariates],
    }
)

Sort columns in design matrix.


In [27]:
# `DataFrame.attrs` is documented as experimental and may not be carried across
# the joins performed in earlier cells, so fall back to the known column names
# when a metadata entry is missing instead of raising KeyError.
predictor_col = design_matrix.attrs.get("predictor_col", "with_digestive_disorders")
with_covariate_cols = design_matrix.attrs.get(
    "with_covariate_cols", [f"with_{c}" for c in covariates]
)
covariate_times_cols = design_matrix.attrs.get(
    "covariate_times_cols", [f"{c}_times" for c in covariates]
)
cols = [
    "gender",
    "age",
    predictor_col,
    "index_date",
    "event_date",
    "T",
    "E",
    *with_covariate_cols,
    *covariate_times_cols,
] # sort columns
design_matrix = design_matrix[cols]
# Re-attach the metadata so the reordered frame carries it regardless of
# whether pandas propagated it through the earlier operations.
design_matrix.attrs.update(
    {
        "predictor_col": predictor_col,
        "with_covariate_cols": list(with_covariate_cols),
        "covariate_times_cols": list(covariate_times_cols),
    }
)
design_matrix

Unnamed: 0_level_0,gender,age,with_digestive_disorders,index_date,event_date,T,E,with_hypertension,with_heart_type_disease,with_neurological_type_disease,with_diabetes,with_hyperlipidemia,hypertension_times,heart_type_disease_times,neurological_type_disease_times,diabetes_times,hyperlipidemia_times
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
10074117,0,55,True,2118-03-13,2118-12-31,293,False,True,True,False,False,False,3,12,0,0,0
10777705,0,65,True,2150-08-12,2150-08-13,1,True,False,False,False,False,True,0,0,0,0,1
10828368,1,50,False,2184-08-20,2185-02-07,171,True,False,False,False,True,False,0,0,0,1,0
11105181,1,37,True,2174-08-23,2174-12-31,130,False,False,False,False,False,False,0,0,0,0,0
11110307,0,39,False,2156-05-08,2156-12-31,237,False,False,False,False,False,False,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19988595,1,75,True,2129-11-27,2137-01-25,2616,True,True,False,True,False,False,2,0,2,0,0
19995244,0,80,True,2111-01-11,2111-12-31,354,False,False,False,True,False,True,0,0,3,0,1
19996832,0,19,False,2179-02-21,2179-12-31,313,False,False,False,False,False,False,0,0,0,0,0
19997448,0,52,False,2121-10-03,2122-07-22,292,True,False,True,False,False,True,0,5,0,0,1


Save design matrix.

In [28]:
# Persist the final design matrix as a pickle file under ../data.
design_matrix.to_pickle("../data/design_matrix_digestive_disorders_psychosis.pkl")