In [1]:
import pathlib
import shutil
import re

import polars as pl

In [2]:
def r2(true, est):
    """Build a Polars expression for the coefficient of determination.

    Parameters
    ----------
    true : str
        Name of the column holding the reference values.
    est : str
        Name of the column holding the estimated values.

    Returns
    -------
    pl.Expr
        ``1 - RSS / TSS`` aliased to ``"rsq"``, where RSS is the sum of
        squared differences between ``true`` and ``est`` and TSS is the sum
        of squared deviations of ``true`` from its mean.
    """
    reference = pl.col(true)
    residual_ss = (reference - pl.col(est)).pow(2).sum()
    total_ss = (reference - reference.mean()).pow(2).sum()
    return (1 - residual_ss / total_ss).alias("rsq")

# Gather GWAS

In [3]:
# Recreate the output directory from scratch so results from a previous run
# never linger next to freshly written files.
gathered_gwas_path = pathlib.Path("data/gathered_gwas/")
shutil.rmtree(gathered_gwas_path, ignore_errors=True)
gathered_gwas_path.mkdir()

In [4]:
def gather_gwas(paths, result_name):
    """Combine many per-phenotype GWAS result files into one Parquet file.

    Parameters
    ----------
    paths : list of path-like
        Tab-separated result files laid out as
        ``data/gwas/<kind>/result.<phenotype>.glm.linear.zst``. Each file
        must provide ``ID`` and ``T_STAT`` columns.
    result_name : str
        File name of the Parquet output, created inside
        ``gathered_gwas_path``.

    The output has one row per input row with columns ``kind`` (first path
    component below ``data/gwas/``), ``phenotype`` (parsed from the file
    name), ``variant_id`` (from ``ID``) and ``chisq`` (``T_STAT`` squared).
    """
    # Path components relative to data/gwas/ with the file extension removed,
    # e.g. ["<kind>", "result.<phenotype>"]. The expression keeps the name "path".
    path_parts = (
        pl.col("path")
        .str.strip_prefix("data/gwas/")
        .str.strip_suffix(".glm.linear.zst")
        .str.split("/")
    )
    (
        pl.scan_csv(paths, separator="\t", include_file_paths="path")
        .select(
            path_parts,
            pl.col("ID").alias("variant_id"),
            pl.col("T_STAT").pow(2).alias("chisq"),
        )
        .select(
            pl.col("path").list.first().alias("kind"),
            # Escape the dot so only a literal "result." prefix is accepted.
            pl.col("path").list.last().str.extract(r"^result\.(.+)$").alias("phenotype"),
            "variant_id",
            "chisq",
        )
        .sink_parquet(gathered_gwas_path / result_name)
    )

In [5]:
# Direct GWAS results: every result file whose parent directory name does not
# mention "approx".
direct_paths = []
for candidate in pathlib.Path("data/gwas/").glob("*/result.*.glm.linear.zst"):
    if "approx" in candidate.parent.stem:
        continue
    direct_paths.append(candidate)

gather_gwas(direct_paths, "direct.parquet")

In [6]:
# Non-anonymized approximation results live in "approx_<kind>/" directories;
# the anonymized "approx_anon_*" directories are gathered separately.
# A glob character class such as "[!anon]" only rejects a single next
# character from the set {a, n, o}, which would also drop kinds like
# "approx_numeric". Filter on the full "approx_anon_" prefix instead.
indirect_non_anon_paths = [
    p
    for p in pathlib.Path("data/gwas/").glob("approx_*/result.*.glm.linear.zst")
    if not p.parent.name.startswith("approx_anon_")
]
gather_gwas(indirect_non_anon_paths, "indirect_non_anon.parquet")

In [7]:
# Anonymized approximation results: everything under "approx_anon_*" directories.
indirect_anon_paths = list(
    pathlib.Path("data/gwas/").glob("approx_anon_*/result.*.glm.linear.zst")
)
gather_gwas(indirect_anon_paths, "indirect_anon.parquet")

# Format GWAS comparisons

1. How well does a linear approximation work for real phenotypes?
2. How much does anonymization hurt performance?
3. Overall, how good are anonymized, linearized results?

In [8]:
# Recreate the comparison directory so every run starts from an empty folder.
gwas_comparison_path = pathlib.Path("data/gwas_comparison/")
shutil.rmtree(gwas_comparison_path, ignore_errors=True)
gwas_comparison_path.mkdir()

In [9]:
# How well does a linear approximation work for real phenotypes?
# Direct vs indirect (non-anonymous)
#
# The direct GWAS provides "chisq_true"; the non-anonymized approximation
# provides "chisq_est". Rows are matched on kind, phenotype and variant.
indirect_non_anon_gwas = (
    pl.scan_parquet(gathered_gwas_path / "indirect_non_anon.parquet")
    # "approx_<kind>" -> "<kind>" so the kinds line up with the direct results.
    .with_columns(pl.col("kind").str.strip_prefix("approx_"))
    .rename({"chisq": "chisq_est"})
)
direct_gwas = (
    pl.scan_parquet(gathered_gwas_path / "direct.parquet")
    .rename({"chisq": "chisq_true"})
)
(
    indirect_non_anon_gwas
    .join(direct_gwas, on=["kind", "phenotype", "variant_id"])
    .select("kind", "phenotype", "variant_id", "chisq_true", "chisq_est")
    .sink_parquet(gwas_comparison_path / "direct_vs_indirect_non_anon.parquet")
)

In [10]:
# How much does anonymization hurt performance?
# Indirect (non-anonymous) vs Indirect (anonymous)
#
# The non-anonymized approximation is the reference ("chisq_true") and the
# anonymized approximation is the estimate ("chisq_est"). Both columns are
# renamed before the join so the labels do not depend on which side of the
# join receives the suffix.
(
    pl.scan_parquet(gathered_gwas_path / "indirect_non_anon.parquet")
    # "approx_<kind>" -> "<kind>"
    .with_columns(pl.col("kind").str.strip_prefix("approx_"))
    .rename({"chisq": "chisq_true"})
    .join(
        pl.scan_parquet(gathered_gwas_path / "indirect_anon.parquet")
        .with_columns(
            # "approx_anon_<kind>_<k>" -> k (trailing digits) and the bare kind.
            pl.col("kind").str.extract("_([0-9]+)$").alias("k"),
            pl.col("kind").str.extract("^approx_anon_(.+)_[0-9]+$"),
        )
        .rename({"chisq": "chisq_est"}),
        on=["kind", "phenotype", "variant_id"],
    )
    .select("kind", "phenotype", "k", "variant_id", "chisq_true", "chisq_est")
    .sink_parquet(gwas_comparison_path / "indirect_non_anon_vs_anon.parquet")
)

In [11]:
# How good are linearized, anonymized results?
# Direct vs Indirect (anonymous)
#
# The direct GWAS is the reference ("chisq_true") and the anonymized
# approximation is the estimate ("chisq_est"). Both columns are renamed
# before the join so the labels do not depend on which side of the join
# receives the suffix.
(
    pl.scan_parquet(gathered_gwas_path / "direct.parquet")
    .rename({"chisq": "chisq_true"})
    .join(
        pl.scan_parquet(gathered_gwas_path / "indirect_anon.parquet")
        .with_columns(
            # "approx_anon_<kind>_<k>" -> k (trailing digits) and the bare kind.
            pl.col("kind").str.extract("_([0-9]+)$").alias("k"),
            pl.col("kind").str.extract("^approx_anon_(.+)_[0-9]+$"),
        )
        .rename({"chisq": "chisq_est"}),
        on=["kind", "phenotype", "variant_id"],
    )
    .select("kind", "phenotype", "k", "variant_id", "chisq_true", "chisq_est")
    .sink_parquet(gwas_comparison_path / "direct_vs_indirect_anon.parquet")
)

# Summarize GWAS results

In [12]:
# Recreate the summary directory so every run starts from an empty folder.
gwas_summary_path = pathlib.Path("data/gwas_summary/")
shutil.rmtree(gwas_summary_path, ignore_errors=True)
gwas_summary_path.mkdir()

In [13]:
def summarize_gwas(file_name, group_vars):
    """Compute per-group R^2 between true and estimated chi-square statistics.

    Parameters
    ----------
    file_name : str
        Name of the comparison Parquet file inside ``gwas_comparison_path``;
        the summary is written under the same name in ``gwas_summary_path``.
    group_vars : list of str
        Columns to group by before computing ``rsq``.
    """
    comparison = pl.scan_parquet(gwas_comparison_path / file_name)
    summary = comparison.group_by(group_vars).agg(
        rsq=r2("chisq_true", "chisq_est")
    )
    # Collect eagerly and write from the DataFrame; sinking this query may not
    # be supported yet in the standard engine.
    summary.collect().write_parquet(gwas_summary_path / file_name)

In [14]:
# Comparison file -> grouping columns. The anonymized comparisons are
# additionally split by "k".
gwas_summary_jobs = [
    ("direct_vs_indirect_non_anon.parquet", ["kind", "phenotype"]),
    ("indirect_non_anon_vs_anon.parquet", ["kind", "phenotype", "k"]),
    ("direct_vs_indirect_anon.parquet", ["kind", "phenotype", "k"]),
]
for gwas_file_name, gwas_group_vars in gwas_summary_jobs:
    summarize_gwas(gwas_file_name, gwas_group_vars)

# Format phenotype comparisons

In [15]:
# Input directory holding the phenotype tables.
pheno_path = pathlib.Path("data/pheno")

# Recreate the comparison directory so every run starts from an empty folder.
pheno_comparison_path = pathlib.Path("data/pheno_comparison/")
shutil.rmtree(pheno_comparison_path, ignore_errors=True)
pheno_comparison_path.mkdir()

In [16]:
# Linear approximations
# Direct vs indirect (non-anon)
#
# For each phenotype method, pair the direct phenotype values ("phenotype_true")
# with the non-anonymized approximation ("phenotype_est") per person and phenotype.
phenotype_methods = ["boolean", "phecodes"]

direct_vs_indirect_non_anon_pheno_comparison_df = []

for phenotype_method in phenotype_methods:
    direct_path = (pheno_path / phenotype_method).with_suffix(".tsv")
    indirect_path = pheno_path / f"approx_{phenotype_method}.tsv"

    # Reshape wide tables (one column per phenotype) into long format so the
    # two sources can be joined on the phenotype name.
    direct_df = pl.scan_csv(direct_path, separator="\t", null_values=["NA"]).unpivot(
        index=["#FID", "IID"], variable_name="phenotype", value_name="phenotype_true"
    )
    indirect_df = pl.scan_csv(indirect_path, separator="\t", null_values=["NA"]).unpivot(
        index=["#FID", "IID"], variable_name="phenotype", value_name="phenotype_est"
    )

    paired_df = direct_df.join(indirect_df, on=["#FID", "IID", "phenotype"])
    this_comparison_df = paired_df.select(
        pl.lit(phenotype_method).alias("kind"),
        "phenotype",
        "#FID",
        "IID",
        "phenotype_true",
        "phenotype_est",
    )
    direct_vs_indirect_non_anon_pheno_comparison_df.append(this_comparison_df)

pl.concat(direct_vs_indirect_non_anon_pheno_comparison_df).sink_parquet(
    pheno_comparison_path / "direct_vs_indirect_non_anon.parquet"
)

In [17]:
# Loss of performance due to anonymization
# Indirect (non-anon) vs indirect (anon)
#
# The non-anonymized approximation is the reference ("phenotype_true"); each
# anonymized approximation file supplies "phenotype_est" for one value of k.
phenotype_methods = ["boolean", "phecodes"]

indirect_anon_vs_non_anon_pheno_comparison_df = []

for phenotype_method in phenotype_methods:
    non_anon_path = pheno_path / f"approx_{phenotype_method}.tsv"
    non_anon_df = pl.scan_csv(non_anon_path, separator="\t", null_values=["NA"]).unpivot(
        index=["#FID", "IID"], variable_name="phenotype", value_name="phenotype_true"
    )

    anon_paths = list(pheno_path.glob(f"approx_anon_{phenotype_method}_*.tsv"))
    for anon_path in anon_paths:
        # k is the run of digits at the end of the file stem; it stays a string.
        k_match = re.search("[0-9]+$", anon_path.stem)
        if k_match is None:
            raise ValueError(f"Unable to parse {anon_path.stem}")
        k = k_match.group()

        anon_df = pl.scan_csv(anon_path, separator="\t", null_values=["NA"]).unpivot(
            index=["#FID", "IID"], variable_name="phenotype", value_name="phenotype_est"
        )
        paired_df = non_anon_df.join(anon_df, on=["#FID", "IID", "phenotype"])
        this_comparison_df = paired_df.select(
            pl.lit(phenotype_method).alias("kind"),
            "phenotype",
            pl.lit(k).alias("k"),
            "#FID",
            "IID",
            "phenotype_true",
            "phenotype_est",
        )
        indirect_anon_vs_non_anon_pheno_comparison_df.append(this_comparison_df)

pl.concat(indirect_anon_vs_non_anon_pheno_comparison_df).sink_parquet(
    pheno_comparison_path / "indirect_non_anon_vs_anon.parquet"
)

In [18]:
# Overall performance
# Direct vs indirect (anon)
#
# The direct phenotype values are the reference ("phenotype_true"); each
# anonymized approximation file supplies "phenotype_est" for one value of k.
phenotype_methods = ["boolean", "phecodes"]

direct_vs_indirect_anon_pheno_comparison_df = []

for phenotype_method in phenotype_methods:
    direct_path = (pheno_path / phenotype_method).with_suffix(".tsv")
    direct_df = pl.scan_csv(direct_path, separator="\t", null_values=["NA"]).unpivot(
        index=["#FID", "IID"], variable_name="phenotype", value_name="phenotype_true"
    )

    indirect_paths = list(pheno_path.glob(f"approx_anon_{phenotype_method}_*.tsv"))
    for indirect_path in indirect_paths:
        # k is the run of digits at the end of the file stem; it stays a string.
        k_match = re.search("[0-9]+$", indirect_path.stem)
        if k_match is None:
            raise ValueError(f"Unable to parse {indirect_path.stem}")
        k = k_match.group()

        indirect_df = pl.scan_csv(indirect_path, separator="\t", null_values=["NA"]).unpivot(
            index=["#FID", "IID"], variable_name="phenotype", value_name="phenotype_est"
        )
        paired_df = direct_df.join(indirect_df, on=["#FID", "IID", "phenotype"])
        this_comparison_df = paired_df.select(
            pl.lit(phenotype_method).alias("kind"),
            "phenotype",
            pl.lit(k).alias("k"),
            "#FID",
            "IID",
            "phenotype_true",
            "phenotype_est",
        )
        direct_vs_indirect_anon_pheno_comparison_df.append(this_comparison_df)

pl.concat(direct_vs_indirect_anon_pheno_comparison_df).sink_parquet(
    pheno_comparison_path / "direct_vs_indirect_anon.parquet"
)

# Summarize phenotype results

In [19]:
# Recreate the summary directory so every run starts from an empty folder.
pheno_summary_path = pathlib.Path("data/pheno_summary/")
shutil.rmtree(pheno_summary_path, ignore_errors=True)
pheno_summary_path.mkdir()

In [20]:
def summarize_pheno(file_name, group_vars):
    """Compute per-group R^2 between true and estimated phenotype values.

    Parameters
    ----------
    file_name : str
        Name of the comparison Parquet file inside ``pheno_comparison_path``;
        the summary is written under the same name in ``pheno_summary_path``.
    group_vars : list of str
        Columns to group by before computing ``rsq``.
    """
    comparison = pl.scan_parquet(pheno_comparison_path / file_name)
    summary = comparison.group_by(group_vars).agg(
        rsq=r2("phenotype_true", "phenotype_est")
    )
    # Collect eagerly and write from the DataFrame; sinking this query may not
    # be supported yet in the standard engine.
    summary.collect().write_parquet(pheno_summary_path / file_name)

In [21]:
# Comparison file -> grouping columns. The anonymized comparisons are
# additionally split by "k".
pheno_summary_jobs = [
    ("direct_vs_indirect_non_anon.parquet", ["kind", "phenotype"]),
    ("indirect_non_anon_vs_anon.parquet", ["kind", "phenotype", "k"]),
    ("direct_vs_indirect_anon.parquet", ["kind", "phenotype", "k"]),
]
for pheno_file_name, pheno_group_vars in pheno_summary_jobs:
    summarize_pheno(pheno_file_name, pheno_group_vars)