In [1]:
import pathlib

import pandas as pd
import polars as pl
import tqdm

# Subsample direct results

In [2]:
# Sort the matches: glob yields files in filesystem order, which is not stable
# across machines, and the first path is later used to draw the variant sample.
direct_paths = sorted(pathlib.Path("data/gwas").glob("plink.b_*.glm.linear.zst"))

len(direct_paths)

1238

In [3]:
# Draw a fixed-seed random subset of 1000 variant IDs from the first direct
# result file; the later cells join their results against this frame so that
# only the sampled variants are kept.
sampled_variant_df = (
    pl.read_csv(direct_paths[0], separator="\t", columns=["ID"])
    .select(pl.col("ID").alias("variant_id"))
    .sample(n=1000, seed=0)
)

sampled_variant_df.head(2)

variant_id
str
"""7:50078299"""
"""7:29500502"""


In [4]:
def _phenotype_id(result_path):
    """Return the file stem with the "plink." and ".glm.linear" parts removed."""
    return result_path.stem.replace("plink.", "").replace(".glm.linear", "")


def _sampled_direct_results(result_path):
    """Read one direct result file and keep only rows for the sampled variants."""
    all_rows = pl.read_csv(result_path, separator="\t", columns=["ID", "P"])
    sampled_rows = all_rows.join(
        sampled_variant_df, left_on="ID", right_on="variant_id"
    )
    return sampled_rows.select(
        phenotype_id=pl.lit(_phenotype_id(result_path)),
        variant_id="ID",
        # Despite the column name, this holds -log10(P), not the raw P value.
        p_value=pl.col("P").log(10).mul(-1),
    )


# One frame per result file, stacked in the order of direct_paths.
full_direct_df = pl.concat(
    [_sampled_direct_results(path) for path in tqdm.tqdm(direct_paths)]
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1238/1238 [08:51<00:00,  2.33it/s]


In [5]:
# Save the subsampled direct results. The output directory is created first so
# that a fresh checkout does not fail here after the long-running loop above.
direct_output_path = pathlib.Path("plot_data/direct_sampled.parquet")
direct_output_path.parent.mkdir(parents=True, exist_ok=True)
full_direct_df.write_parquet(direct_output_path)

# Subsample indirect results

In [6]:
# Collect every compressed indirect result table whose name starts with "pca".
igwas_paths = [*pathlib.Path("data/igwas").glob("pca*.tsv.zst")]

igwas_paths

[PosixPath('data/igwas/pca_0.25.tsv.zst'),
 PosixPath('data/igwas/pca_0.1.tsv.zst'),
 PosixPath('data/igwas/pca_0.5.tsv.zst'),
 PosixPath('data/igwas/pca_0.75.tsv.zst'),
 PosixPath('data/igwas/pca_1.0.tsv.zst'),
 PosixPath('data/igwas/pca_0.9.tsv.zst')]

In [7]:
# Make sure the output directory exists before starting any of the slow reads.
output_dir = pathlib.Path("plot_data")
output_dir.mkdir(parents=True, exist_ok=True)

for path in igwas_paths:
    # Stream the file through pandas in chunks of 500,000 rows and keep only the
    # rows for the sampled variants. The context manager guarantees the
    # underlying file handle is closed even if a chunk fails part-way through.
    with pd.read_csv(
        path,
        sep="\t",
        usecols=["phenotype_id", "variant_id", "p_value"],
        dtype={"phenotype_id": str, "variant_id": str, "p_value": float},
        chunksize=500_000,
    ) as igwas_reader:
        sampled_chunks = [
            pl.DataFrame(chunk).join(sampled_variant_df, on=["variant_id"])
            for chunk in tqdm.tqdm(igwas_reader)
        ]

    # NOTE(review): p_value is passed through exactly as stored in the input
    # file, whereas the direct results cell stores -log10(P) under the same
    # column name -- confirm both outputs are on the intended scale.
    indirect_df = pl.concat(sampled_chunks)

    # e.g. "pca_0.25.tsv.zst" -> "indirect_sampled_pca_0.25.parquet":
    # the first with_suffix drops ".zst", the second swaps ".tsv" for ".parquet".
    output_path = (
        output_dir.joinpath("indirect_sampled_" + path.name)
        .with_suffix("").with_suffix(".parquet")
    )
    indirect_df.write_parquet(output_path)

1238it [12:43,  1.62it/s]
1238it [13:06,  1.57it/s]
1238it [13:35,  1.52it/s]
1238it [12:28,  1.65it/s]
1238it [13:19,  1.55it/s]