In [1]:
import pathlib
import tempfile
import shutil
import shlex
import subprocess
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import tqdm.notebook as tqdm
import polars as pl
import polars.selectors as cs

import webgwas_analysis.phecodes
import webgwas_analysis.linearize

In [2]:
# Start each run from an empty phenotype output directory so files left over
# from an earlier run cannot be picked up by later cells.
pheno_path = pathlib.Path("data/pheno")
shutil.rmtree(pheno_path, ignore_errors=True)
pheno_path.mkdir(parents=True, exist_ok=True)

In [3]:
def assert_is_plink_formatted(df: pl.DataFrame | pl.LazyFrame) -> None:
    """Sanity-check the layout of a phenotype table.

    The checks are:

    * the table has more than two columns,
    * the first two columns are named ``#FID`` and ``IID``,
    * the third column (the first phenotype) has maximum 3 and minimum 2.

    Only the third column's values are inspected; later phenotype columns are
    not checked.

    Args:
        df: Eager or lazy polars frame. For a lazy frame only the third
            column is collected.

    Raises:
        ValueError: If ``df`` is neither a ``pl.DataFrame`` nor a
            ``pl.LazyFrame``.
        AssertionError: If any of the checks above fails.
    """
    is_lazy = isinstance(df, pl.LazyFrame)
    if isinstance(df, pl.DataFrame):
        names = df.columns
    elif is_lazy:
        names = df.collect_schema().names()
    else:
        raise ValueError(f"Unknown type supplied: {df}")

    assert len(names) > 2
    first_pheno = names[2]
    if is_lazy:
        # Materialize just the one column we need to look at.
        series = df.select(first_pheno).collect()[first_pheno]
    else:
        series = df[first_pheno]

    assert names[0] == "#FID"
    assert names[1] == "IID"
    assert series.max() == 3
    assert series.min() == 2

# 3-letter, common ICD-10 codes

In [4]:
# Lazily read the first 100,000 rows of the raw phenotype table. Every `b_*`
# column is shifted by -2 (presumably from 2/3 coding to 0/1, so that the
# column sums below count cases -- TODO confirm against the raw file) and
# renamed to drop its `b_` prefix.
raw_icd_df = (
    pl.scan_csv("../../data/pheno_jan2024.tsv", separator="\t")
    .head(100_000)
    .select(
        "#FID",
        "IID",
        # removeprefix strips only the leading `b_`; str.replace would also
        # delete any later `b_` occurring inside a column name.
        pl.col("^b_.+$").sub(2).name.map(lambda x: x.removeprefix("b_")),
    )
)

# Keep the three-character codes (one letter + two digits) whose column sum
# is at least 1000.
icd_codes = (
    raw_icd_df
    .select(pl.col("^[A-Z][0-9]{2}$").sum())
    .unpivot(variable_name="icd_code", value_name="n_cases")
    .filter(pl.col("n_cases").ge(1000))
    .select("icd_code")
    .collect()
    ["icd_code"]
    .to_list()
)
print(f"Keeping only {len(icd_codes)} ICD codes after filtering")

# Shift the retained columns back by +2 for the on-disk table.
icd_df = raw_icd_df.select(["#FID", "IID"] + [pl.col(c).add(2) for c in icd_codes])

# Validate before writing so a malformed table never reaches disk.
assert_is_plink_formatted(icd_df)
icd_df.sink_csv(pheno_path / "icd.tsv", separator="\t", null_value="NA")

Keeping only 340 ICD codes after filtering


# Anonymize data

In [5]:
def anonymize(dataframe, k):
    """Run ``mdav`` on the non-identifier columns of ``dataframe``.

    The frame (minus ``#FID`` and ``IID``) is written to a CSV in a temporary
    directory, ``run_mdav`` is invoked on it with the given ``k``, and the CSV
    that ``mdav`` produces is read back.

    Args:
        dataframe: Eager polars frame containing ``#FID`` and ``IID`` columns.
        k: Value forwarded to ``mdav -k``.

    Returns:
        A polars DataFrame holding the contents of the ``mdav`` output file.
    """
    with tempfile.TemporaryDirectory() as scratch:
        scratch_dir = pathlib.Path(scratch)
        csv_in = scratch_dir / "input.csv"
        csv_out = scratch_dir / "output.csv"

        # Sample identifiers are never handed to mdav.
        dataframe.drop("#FID", "IID").write_csv(csv_in)
        run_mdav(csv_in, csv_out, k=k)

        # Read eagerly while the temporary directory still exists.
        return pl.read_csv(csv_out, infer_schema_length=10000)

def run_mdav(input_path, output_path, k, n_threads=15):
    """Invoke the external ``mdav`` executable and wait for it to finish.

    The argument vector is passed to ``subprocess.run`` as a list, so paths
    containing spaces or quote characters reach ``mdav`` unmodified instead of
    being re-tokenized.

    Args:
        input_path: Path of the CSV to anonymize (``str`` or ``pathlib.Path``).
        output_path: Path ``mdav`` should write to (``str`` or ``pathlib.Path``).
        k: Value passed to ``mdav -k``.
        n_threads: Value exported as ``RAYON_NUM_THREADS`` for the child
            process (presumably the thread-pool size mdav uses -- confirm
            against the mdav documentation). Defaults to 15.

    Raises:
        subprocess.CalledProcessError: If ``mdav`` exits with a non-zero status.
        FileNotFoundError: If the ``mdav`` executable cannot be found.
    """
    env = os.environ.copy()
    env["RAYON_NUM_THREADS"] = str(n_threads)
    command = [
        "mdav",
        "-i", pathlib.Path(input_path).as_posix(),
        "-o", pathlib.Path(output_path).as_posix(),
        "-k", str(k),
        "--just-centroids",
        "--precision", "32",
    ]
    subprocess.run(command, env=env, check=True)

In [6]:
# Re-read the ICD table written earlier as an eager frame.
icd_df = pl.read_csv(pheno_path / "icd.tsv", separator="\t", null_values=["NA"])

# IMPORTANT! This is where k-values for anonymization are defined (and only here!)
# k_values = [1000]  # Use this for testing because it's WAY faster
k_values = [5, 10, 15, 20, 50, 100]

for k in tqdm.tqdm(k_values):
    # mdav's output carries an `n_occurrences` column that is not a phenotype.
    anonymized_data = anonymize(icd_df, k=k).drop("n_occurrences")
    webgwas_analysis.phecodes.check_phenotype_ranges(anonymized_data, min_=2, max_=3)
    anon_icd_path = pheno_path / f"anon_icd_{k:04}.tsv"
    anonymized_data.write_csv(anon_icd_path, separator="\t")

  0%|          | 0/6 [00:00<?, ?it/s]

# Phecodes

In [7]:
# Load every phecode definition, then keep only those that
# `filter_definitions` retains for the ICD codes selected above.
phecode_definition_path = "../../data/phecodes/phecode_definitions1.2.csv"
phecode_map_path = "../../data/phecodes/Phecode_map_v1_2_icd9_icd10cm_09_30_2024.csv"

phecodes = webgwas_analysis.phecodes.load_definitions(phecode_definition_path, phecode_map_path)
print(f"Loaded {len(phecodes)} phecodes")

phecodes = webgwas_analysis.phecodes.filter_definitions(phecodes, icd_codes)
print(f"Filtered to {len(phecodes)} phecodes")

Loaded 1873 phecodes
Filtered to 144 phecodes


In [8]:
# Non-anonymized phecodes
# Re-read the ICD table and shift every phenotype column by -2; the
# identifier columns are left untouched.
icd_df = (
    pl.read_csv(pheno_path / "icd.tsv", separator="\t", null_values=["NA"])
    .with_columns(pl.all().exclude("#FID", "IID") - 2)
)

min_n_cases = 10

id_columns = icd_df.select("#FID", "IID")
phecode_values = webgwas_analysis.phecodes.apply_definitions_fuzzy(
    definitions=phecodes,
    icd_df=icd_df.drop(["#FID", "IID"]),
)
phecode_values = webgwas_analysis.phecodes.filter_phenotypes(
    phecode_values, min_n_cases=min_n_cases
)

# Shift back by +2 and re-attach the identifiers on the left.
phecode_df = pl.concat(
    [id_columns, phecode_values.select(pl.all() + 2)],
    how="horizontal",
)

print(f"Filtered to {phecode_df.shape[1] - 2} phecodes with >= {min_n_cases} cases")
assert_is_plink_formatted(phecode_df)
phecode_df.write_csv(pheno_path / "phecodes.tsv", separator="\t", null_value="NA")

Filtered to 140 phecodes with >= 10 cases


In [9]:
# Apply phecode definitions in anonymized data
for k in tqdm.tqdm(k_values):
    anon_icd_path = pheno_path / f"anon_icd_{k:04}.tsv"
    icd_df = (
        pl.read_csv(anon_icd_path, separator="\t", null_values=["NA"])
        .with_columns(pl.all() - 2)
    )
    # The anonymized tables must not contain sample identifiers.
    assert not {"#FID", "IID"} & set(icd_df.columns)

    # NOTE(review): assigned but not used inside this loop -- no case-count
    # filter is applied to the anonymized phecodes.
    min_n_cases = 10
    anon_phecode_values = webgwas_analysis.phecodes.apply_definitions_fuzzy(
        definitions=phecodes,
        icd_df=icd_df,
    )
    phecode_df = anon_phecode_values.select(pl.all() + 2)

    webgwas_analysis.phecodes.check_phenotype_ranges(phecode_df, min_=2, max_=3)
    phecode_df.write_csv(pheno_path / f"anon_phecodes_{k:04}.tsv", separator="\t", null_value="NA")

  0%|          | 0/6 [00:00<?, ?it/s]

# Random Boolean combinations

100 random pairs (the count is set by `N_random_combos` in the next cell)

For each pair (x, y), add `x AND y`, `x OR y`, `x AND NOT y`.

In [10]:
N_random_combos = 100

np.random.seed(0)

unique_combos = set()
boolean_phenos = []
fuzzy_phenos = []

while len(unique_combos) < N_random_combos:
    a, b = np.random.choice(icd_codes, size=2, replace=False)
    # (a, b) and (b, a) count as the same pair; redraw if either was seen.
    if not unique_combos.isdisjoint({(a, b), (b, a)}):
        continue
    unique_combos.add((a, b))

    col_a = pl.col(a)
    col_b = pl.col(b)
    and_name = f"and_{a}_{b}"
    or_name = f"or_{a}_{b}"
    not_name = f"not_{a}_{b}"

    # Exact Boolean combinations, applied later to the original ICD table.
    boolean_phenos += [
        (col_a & col_b).alias(and_name),
        (col_a | col_b).alias(or_name),
        (col_a & ~col_b).alias(not_name),
    ]

    # Fuzzy counterparts (min / max / 1 - x), applied later to the
    # anonymized ICD tables.
    fuzzy_phenos += [
        pl.min_horizontal(col_a, col_b).alias(and_name),
        pl.max_horizontal(col_a, col_b).alias(or_name),
        pl.min_horizontal(col_a, pl.lit(1.0) - col_b).alias(not_name),
    ]

In [11]:
# Evaluate the exact Boolean combinations on the original ICD table.
# `icd.tsv` is written with null_value="NA", so "NA" must be declared as the
# null marker here as well (as every other reader of this file does);
# otherwise a column containing "NA" would not parse as numeric.
boolean_df = (
    pl.scan_csv(pheno_path / "icd.tsv", separator="\t", null_values=["NA"])
    # Shift the ICD columns by -2 before combining them ...
    .select("#FID", "IID", pl.col("^[A-Z][0-9]{2}$").sub(2))
    # ... and shift every combination back by +2 for the output table.
    .select(["#FID", "IID"] + [p + 2 for p in boolean_phenos])
)

# Validate before writing so a malformed table never reaches disk.
assert_is_plink_formatted(boolean_df)
boolean_df.sink_csv(pheno_path / "boolean.tsv", separator="\t", null_value="NA")

In [12]:
# Evaluate the fuzzy combinations on each anonymized ICD table.
for k in tqdm.tqdm(k_values):
    anon_icd_lazy = pl.scan_csv(pheno_path / f"anon_icd_{k:04}.tsv", separator="\t")
    shifted = anon_icd_lazy.select(pl.col("^[A-Z][0-9]{2}$") - 2)
    boolean_df = shifted.select([p + 2 for p in fuzzy_phenos]).collect()
    boolean_df.write_csv(pheno_path / f"anon_boolean_{k:04}.tsv", separator="\t", null_value="NA")

  0%|          | 0/6 [00:00<?, ?it/s]

# Approximated

The helper functions below keep each approximation's intermediate variables local, which reduces errors caused by having too many variables in the notebook's global scope.

At a high level, what we're doing here is linearizing Phecodes and Boolean phenotypes. For 
non-anonymized data, this is just a regression like `phecode ~ icd_codes`, and we're taking the
predictions as the linearized phenotypes. For anonymized data, we're doing this regression in 
anonymized data (smaller data, no FID/IID available), then evaluating the coefficients in the 
original data to get full-size phenotypes. The only real difference is that coefficients from
regressions in anonymized data will be a bit noisier than those in the original data.

In [13]:
def approx_original(feature_path, target_path, output_path):
    """Linearize target phenotypes against feature phenotypes in the original data.

    Both inputs are tab-separated tables keyed by ``#FID``/``IID`` with ``NA``
    as the null marker. They are joined on the identifier columns, every
    target column is passed to ``webgwas_analysis.linearize.approximate_all``
    as an endogenous variable with all feature columns as exogenous
    variables, and the result is written next to the identifiers.

    Args:
        feature_path: Path of the feature table (e.g. the ICD table).
        target_path: Path of the target table (e.g. phecodes or booleans).
        output_path: Destination of the tab-separated output.

    Raises:
        AssertionError: If a column name occurs in both tables (besides the
            identifiers), or if the output row count differs from the joined
            row count.
    """
    feature_df = pl.scan_csv(feature_path, separator="\t", null_values=["NA"])
    target_df = pl.scan_csv(target_path, separator="\t", null_values=["NA"])

    features = feature_df.drop("#FID", "IID").collect_schema().names()
    targets = target_df.drop("#FID", "IID").collect_schema().names()

    # Same guard as approx_anon: on a name collision the join would rename the
    # target column to `<name>_right`, and `<name>` in the merged frame would
    # then silently refer to the feature column.
    shared_columns = set(features).intersection(targets)
    assert len(shared_columns) == 0, f"Columns present in both tables: {sorted(shared_columns)}"

    merged_df = (
        feature_df
        .join(target_df, on=["#FID", "IID"])
        .collect()
    )

    approx_df = (
        webgwas_analysis.linearize.approximate_all(
            df=merged_df,
            endogs=targets,
            exogs=features,
        )
        .pipe(lambda df: pl.concat([merged_df.select("#FID", "IID"), df], how="horizontal"))
    )
    assert approx_df.shape[0] == merged_df.shape[0]
    approx_df.write_csv(output_path, separator="\t", null_value="NA")

In [14]:
def approx_anon(method, k_values):
    """Linearize anonymized targets and evaluate the result on the full ICD table.

    For each ``k``, ``anon_icd_{k}.tsv`` (features) and
    ``anon_{method}_{k}.tsv`` (targets) are read from ``pheno_path`` and
    placed side by side. ``webgwas_analysis.linearize.approximate_all`` is
    called with the full-size ICD table as ``eval_df``, and the output is
    written to ``approx_anon_{method}_{k}.tsv`` together with the ``#FID`` /
    ``IID`` columns of the full table.

    Args:
        method: Name fragment selecting the target files (the notebook uses
            ``"phecodes"`` and ``"boolean"``).
        k_values: Iterable of ``k`` values whose anonymized files exist.

    Raises:
        AssertionError: If feature and target tables differ in row count,
            share a column name, or if the output row count differs from the
            full ICD table.
    """
    full_icd_df = pl.read_csv(pheno_path / "icd.tsv", separator="\t", null_values=["NA"])
    # These two views do not depend on k, so build them once.
    sample_ids = full_icd_df.select("#FID", "IID")
    full_features = full_icd_df.drop(["#FID", "IID"])

    for k in tqdm.tqdm(k_values):
        feature_df = pl.read_csv(pheno_path / f"anon_icd_{k:04}.tsv", separator="\t", null_values=["NA"])
        target_df = pl.read_csv(pheno_path / f"anon_{method}_{k:04}.tsv", separator="\t", null_values=["NA"])
        assert feature_df.shape[0] == target_df.shape[0]

        features = feature_df.columns
        targets = target_df.columns
        # A horizontal concat needs disjoint column names.
        assert set(features).isdisjoint(targets)

        merged_df = pl.concat([feature_df, target_df], how="horizontal")

        fitted = webgwas_analysis.linearize.approximate_all(
            df=merged_df,
            endogs=targets,
            exogs=features,
            eval_df=full_features,
        )
        approx_df = pl.concat([sample_ids, fitted], how="horizontal")
        assert approx_df.shape[0] == full_icd_df.shape[0]

        approx_df.write_csv(pheno_path / f"approx_anon_{method}_{k:04}.tsv", separator="\t", null_value="NA")

## Phecodes

In [15]:
# Linearize the phecodes against the ICD columns of the original data.
approx_original(
    feature_path=pheno_path / "icd.tsv",
    target_path=pheno_path / "phecodes.tsv",
    output_path=pheno_path / "approx_phecodes.tsv",
)

In [16]:
# Same linearization, fitted on each anonymized phecode table.
approx_anon(method="phecodes", k_values=k_values)

  0%|          | 0/6 [00:00<?, ?it/s]

## Approximated booleans

In [17]:
# Linearize the Boolean combinations against the ICD columns of the original data.
approx_original(
    feature_path=pheno_path / "icd.tsv",
    target_path=pheno_path / "boolean.tsv",
    output_path=pheno_path / "approx_boolean.tsv",
)

In [18]:
# Same linearization, fitted on each anonymized Boolean table.
approx_anon(method="boolean", k_values=k_values)

  0%|          | 0/6 [00:00<?, ?it/s]