In [1]:
import os
import pathlib
import shlex
import subprocess
import tempfile

import polars as pl
import tqdm.notebook as tqdm

In [2]:
# Directory (relative to the notebook's working directory) that receives every
# phenotype table written below.
pheno_dir = pathlib.Path("data/pheno")

# Create it together with any missing parents; re-running the cell is a no-op.
pheno_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Load the tab-separated phenotype table in full.
raw_pheno_df = pl.read_csv(
    "../../data/pheno_jan2024.tsv",
    separator="\t",
)

# Zero-row preview: displays only the column names and dtypes.
raw_pheno_df.head(0)

#FID,IID,q_46_0,q_47_0,q_48_0,q_49_0,q_50_0,q_51_0,q_78_0,q_102_0,q_102_1,q_134_0,q_135_0,q_136_0,q_137_0,q_699_0,q_709_0,q_757_0,q_767_0,q_777_0,q_796_0,q_845_0,q_864_0,q_874_0,q_884_0,q_894_0,q_904_0,q_914_0,q_1050_0,q_1060_0,q_1070_0,q_1080_0,q_1090_0,q_1160_0,q_1269_0,q_1279_0,q_1289_0,…,b_Z50,b_Z51,b_Z52,b_Z53,b_Z54,b_Z56,b_Z57,b_Z58,b_Z59,b_Z60,b_Z63,b_Z71,b_Z72,b_Z73,b_Z74,b_Z75,b_Z76,b_Z80,b_Z81,b_Z82,b_Z83,b_Z84,b_Z85,b_Z86,b_Z87,b_Z88,b_Z89,b_Z90,b_Z91,b_Z92,b_Z93,b_Z94,b_Z95,b_Z96,b_Z97,b_Z98,b_Z99
i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64


In [4]:
# Tunables for this cell.
ICD_PREFIX = "b_"    # prefix carried by the ICD columns in the raw table
N_SAMPLES = 300_000  # only the first N_SAMPLES rows of the raw table are used
MIN_CASES = 500      # codes whose count is below this are dropped

# Keep the ID columns (renaming "#FID" to "FID") plus every column named
# b_<uppercase letter><two digits>, with the "b_" prefix removed.
icd_all_df = (
    raw_pheno_df
    .head(N_SAMPLES)
    .select(
        pl.col("#FID").alias("FID"),
        "IID",
        # Slice the prefix off instead of calling str.strip("b_"): strip()
        # treats its argument as a *set of characters* to trim from both ends,
        # which is only accidentally equivalent to removing a prefix.
        pl.col("^b_[A-Z][0-9]{2}$").name.map(lambda name: name[len(ICD_PREFIX):]),
    )
)

# Per-code count, computed as sum(value - 2) over the selected rows.
# NOTE(review): this is a case count only if the columns are coded 2 = control,
# 3 = case -- confirm against the phenotype file's coding.
icd_codes_to_keep = (
    icd_all_df
    .drop("FID", "IID")
    .select(pl.all().sub(2).sum())
    .unpivot(value_name="n_cases")
    .filter(pl.col("n_cases").ge(MIN_CASES))
    ["variable"]
    .to_list()
)

# Final table: IDs followed by the retained codes, in their original order.
icd_df = icd_all_df.select("FID", "IID", *icd_codes_to_keep)
icd_df.write_csv(pheno_dir.joinpath("original.tsv"), separator="\t")
print(icd_df.shape)
icd_df.head(0)

(300000, 749)


FID,IID,A04,A08,A09,A15,A37,A38,A41,A49,A63,B00,B01,B02,B05,B06,B07,B08,B15,B19,B26,B27,B34,B35,B36,B37,B86,B95,B96,B97,B98,B99,C15,C16,C18,C19,C20,…,Z40,Z41,Z42,Z43,Z45,Z46,Z47,Z48,Z50,Z51,Z53,Z57,Z60,Z63,Z71,Z72,Z73,Z74,Z75,Z80,Z82,Z83,Z85,Z86,Z87,Z88,Z89,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64


In [5]:
def anonymize(dataframe, k):
    """Pass ``dataframe`` (minus its ID columns) through ``run_mdav``.

    The ``FID`` and ``IID`` columns are dropped, the remaining columns are
    written to a CSV inside a temporary directory, ``run_mdav`` is invoked on
    that file with the given ``k``, and the CSV it produces is read back.

    Parameters
    ----------
    dataframe : polars.DataFrame
        Table containing ``FID`` and ``IID`` columns plus the data columns.
    k : int
        Forwarded unchanged to ``run_mdav``.

    Returns
    -------
    polars.DataFrame
        Contents of the output file written by ``run_mdav``.
    """
    with tempfile.TemporaryDirectory() as scratch:
        workdir = pathlib.Path(scratch)
        mdav_input = workdir / "input.csv"
        mdav_output = workdir / "output.csv"

        features = dataframe.drop("FID", "IID")
        features.write_csv(mdav_input)

        run_mdav(mdav_input, mdav_output, k=k)

        # Read eagerly while the temporary directory still exists; it is
        # removed as soon as the ``with`` block is left.
        return pl.read_csv(mdav_output, infer_schema_length=10000)

def run_mdav(input_path, output_path, k, num_threads=110):
    """Run the external ``mdav`` command-line tool on a CSV file.

    The command is passed to ``subprocess.run`` as an argument list rather
    than being formatted into a string and re-tokenised, so paths containing
    spaces or quote characters reach ``mdav`` as single, intact arguments.

    Parameters
    ----------
    input_path : pathlib.Path
        File handed to ``mdav`` via ``-i``.
    output_path : pathlib.Path
        Destination handed to ``mdav`` via ``-o``.
    k : int
        Value passed to ``-k``.
    num_threads : int, optional
        Value exported to the child process as ``RAYON_NUM_THREADS``.
        Defaults to 110, the value that was previously hard-coded.

    Raises
    ------
    subprocess.CalledProcessError
        If ``mdav`` exits with a non-zero status.
    """
    # Copy the environment so the thread setting only affects the child.
    env = os.environ.copy()
    env["RAYON_NUM_THREADS"] = str(num_threads)

    command = [
        "mdav",
        "-i", input_path.as_posix(),
        "-o", output_path.as_posix(),
        "-k", str(k),
        "--just-centroids",
        "--precision", "32",
    ]
    subprocess.run(command, env=env, check=True)

In [6]:
# Values of k to run in this pass.
# Presumably other candidates, currently left out: 5, 10, 15, 20
k_values = [50, 100]

for k in tqdm.tqdm(k_values):
    # One output table per k, with k zero-padded to three digits in the name.
    anon_path = pheno_dir.joinpath(f"anon_{k:03}.tsv")
    anon_df = anonymize(icd_df, k=k)
    anon_df.write_csv(anon_path, separator="\t")

  0%|          | 0/2 [00:00<?, ?it/s]