In [1]:
import numpy as np
import pandas as pd
import polars as pl
import tqdm.notebook as tqdm

In [2]:
# Phecodes eligible for analysis: every definition whose `sex` field is
# neither "Male" nor "Female". `phecode` is read as a string so that codes
# such as "008" keep their leading zeros.
# NOTE(review): whether rows with a null `sex` survive this filter depends on
# how `is_in` treats nulls in the installed polars version — confirm.
allowed_phecodes_df = (
    pl.read_csv(
        "data/phecode_map/phecode_definitions1.2.csv",
        dtypes={"phecode": pl.Utf8},
    )
    .filter(~pl.col("sex").is_in(["Male", "Female"]))
    .select(["phecode"])
)

allowed_phecodes_df.head(2)

phecode
str
"""008"""
"""008.5"""


In [3]:
# ICD-10 -> phecode lookup, restricted to the allowed phecodes.
# Every column is read as text so codes like "008.5" are not parsed as numbers.
# Result columns: icd, phecode, control_range (the phecode exclusion range).
phecode_map_df = (
    pl.read_csv(
        "data/phecode_map/Phecode_map_v1_2_icd10_beta.csv",
        dtypes={
            column: pl.Utf8
            for column in ("ICD10", "PHECODE", "Exl. Phecodes", "Excl. Phenotypes")
        },
    )
    .rename({"ICD10": "icd", "PHECODE": "phecode", "Exl. Phecodes": "control_range"})
    # The free-text exclusion description is not used downstream.
    .drop("Excl. Phenotypes")
    # Inner join: keep only mappings whose phecode is in the allowed list.
    .join(allowed_phecodes_df, on="phecode", how="inner")
    .unique()
)

phecode_map_df.head(2)

icd,phecode,control_range
str,str,str
"""A05.8""","""008.5""","""001-009.99"""
"""A06.6""","""008""","""001-009.99"""


In [4]:
# Wide binary phenotype table: one row per individual (IID) and one `b_<ICD>`
# column per ICD code. FID is dropped here and re-created from IID on export.
# NOTE(review): this path starts with "../data" while the other cells read and
# write under "data/" — confirm the working directory is what you expect.
pheno_df = (
    pl.read_csv("../data/pheno/binary_pheno.tsv", separator="\t")
    .drop("FID")
)

# Zero rows: show the schema only, so no individual-level data is displayed.
pheno_df.head(0)

IID,b_A01,b_A02,b_A03,b_A04,b_A05,b_A06,b_A07,b_A08,b_A09,b_A15,b_A16,b_A18,b_A31,b_A36,b_A37,b_A38,b_A40,b_A41,b_A42,b_A46,b_A48,b_A49,b_A54,b_A60,b_A63,b_A69,b_A80,b_A87,b_B00,b_B01,b_B02,b_B05,b_B06,b_B07,b_B08,b_B15,…,b_Z50,b_Z51,b_Z52,b_Z53,b_Z54,b_Z56,b_Z57,b_Z58,b_Z59,b_Z60,b_Z63,b_Z71,b_Z72,b_Z73,b_Z74,b_Z75,b_Z76,b_Z80,b_Z81,b_Z82,b_Z83,b_Z84,b_Z85,b_Z86,b_Z87,b_Z88,b_Z89,b_Z90,b_Z91,b_Z92,b_Z93,b_Z94,b_Z95,b_Z96,b_Z97,b_Z98,b_Z99
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64


In [5]:
# Reshape the wide ICD table into one row per (individual, ICD code) the
# individual actually has.
long_pheno_df = (
    pheno_df
    # Subtracting 2 and casting to Boolean maps 2 -> False and every other
    # non-null value -> True.
    # NOTE(review): presumably the input uses Plink-style 2 = control /
    # 3 = case coding — confirm there are no other codes (e.g. -9 for missing),
    # since those would be treated as cases here.
    .with_columns(pl.col("^b_.+$").sub(2).cast(pl.Boolean))
    .melt(id_vars=["IID"], variable_name="icd", value_name="has_icd")
    # Keep only the positive entries; the flag itself is then redundant.
    .filter("has_icd")
    .drop("has_icd")
    # Remove the literal "b_" column prefix. `str.strip_chars("b_")` would
    # instead trim any run of the characters "b" and "_" from BOTH ends of the
    # name, which only works by accident for the current column names.
    .with_columns(pl.col("icd").str.strip_prefix("b_"))
)

long_pheno_df.head(0)

IID,icd
i64,str


In [6]:
# Translate each individual's ICD codes into phecodes: one row per
# (IID, phecode) for which the individual is a case.
long_case_df = (
    long_pheno_df
    # Inner join: ICD codes with no phecode mapping are discarded.
    .join(phecode_map_df, on="icd", how="inner")
    .drop("control_range")
    # Several ICD codes can map to the same phecode; keep one row per pair.
    .unique(subset=["IID", "phecode"])
    .drop_nulls()
    .select(["IID", "phecode"])
)

long_case_df.head(0)

IID,phecode
i64,str


In [7]:
# Control-exclusion ranges per phecode: one row per (phecode, range), with the
# range split into `control_start` / `control_end` bounds.
phecode_control_df = (
    phecode_map_df
    .drop("icd")
    .unique()
    # Phecodes without an exclusion range get no row at all.
    .drop_nulls()
    # A phecode may list several comma-separated ranges; give each its own row.
    .with_columns(control_range=pl.col("control_range").str.split(","))
    .explode("control_range")
    # Tolerate "a-b, c-d" style lists: without this, the leading space would
    # end up inside `control_start` and corrupt the range comparison later on.
    .with_columns(control_range=pl.col("control_range").str.strip_chars())
    .with_columns(
        control_def=pl.col("control_range")
        .str.split_exact("-", 1)
        .struct.rename_fields(["control_start", "control_end"])
    )
    .unnest("control_def")
    .with_columns(pl.col("control_start", "control_end").str.strip_chars())
    .with_columns(
        control_start=pl.col("control_start").fill_null(""),
        # A range written without "-" names a single code, so its end equals
        # its start. Filling the end with "" (as before) produced a range that
        # the downstream `start <= phecode <= end` test could never match.
        control_end=(
            pl.col("control_end")
            .fill_null(pl.col("control_start"))
            .fill_null("")
        ),
    )
    .unique()
    .sort("phecode")
)

phecode_control_df.head(2)

phecode,control_range,control_start,control_end
str,str,str,str
"""008""","""001-009.99""","""001""","""009.99"""
"""008.5""","""001-009.99""","""001""","""009.99"""


In [8]:
%%time

phecodes = sorted(long_case_df["phecode"].unique().to_list())
IIDs = pheno_df["IID"].to_list()

phecode_case_control_df = pd.DataFrame(index=IIDs, columns=phecodes, dtype=float)

for iid, phecode in long_case_df.to_numpy():
    phecode_case_control_df.loc[iid, phecode] = 1

for row in phecode_control_df.to_dicts():
    if not row["phecode"] in phecodes:
        continue 
        
    to_exclude = [
        phecode for phecode in phecodes
        if phecode >= row["control_start"] and phecode <= row["control_end"]
    ]
    control_series = (
        phecode_case_control_df
        .loc[:, to_exclude]
        .max(axis=1, skipna=True)
        .astype(float)
        .map({
            1.0: np.nan, # any cases -> excluded
            np.nan: 0,   # no cases  -> control
            0: 0         # no cases  -> control
        })
        .rename("control")
    )
    new_series = (
        pd.concat([
            phecode_case_control_df[row["phecode"]],
            control_series
        ], axis=1)
        .assign(final=lambda df: df["control"].combine_first(df[row["phecode"]]))
        .loc[:, "final"]
        .rename(row["phecode"])
    )
        
    phecode_case_control_df[row["phecode"]] = new_series

phecode_case_control_df = (
    phecode_case_control_df
    # Only keep phecodes with >= 100 cases
    .loc[:, lambda df: df.sum() >= 100]
    # Add FID, add "phecode_" prefix to all phecodes
    .rename_axis(index="IID")
    .reset_index()
    .assign(FID=lambda df: df["IID"])
    .set_index(["FID", "IID"])
    .rename(columns=lambda x: f"phecode_{x}")
    .reset_index()
)

CPU times: user 1min 1s, sys: 10.4 s, total: 1min 11s
Wall time: 1min 12s


In [9]:
# Sanity check: one row per IID in pheno_df; columns are FID, IID, then one
# "phecode_*" column per phecode that passed the minimum-case filter.
phecode_case_control_df.shape

(100000, 435)

In [10]:
# Export for Plink 2. Shifting the 0/1 status by +2 gives control = 2 and
# case = 3, a coding that makes Plink 2 fit a linear regression. Missing
# (excluded) entries stay NaN and are written as "NA".
(
    phecode_case_control_df
    # Move the ID columns into the index so only phenotype columns are shifted.
    .set_index(["FID", "IID"])
    .add(2)
    .reset_index()
    .to_csv(
        "data/pheno/phecodes.tsv",
        sep="\t",
        index=False,
        na_rep="NA",
        float_format="%.0f",
    )
)