In [1]:
import pathlib

import pandas as pd
import polars as pl
import seaborn as sns
import sklearn.linear_model
import tqdm.notebook as tqdm

# Load all GWAS summary statistics

In [2]:
# Build one long table pairing, for every phenotype and variant, the p-value
# from the "R{i}" GWAS run with the p-value from the matching "PR{i}" run,
# then persist it as parquet for downstream cells.
root = pathlib.Path("data/gwas")

# Number of phenotype pairs on disk; files are numbered 1..N_PHENOTYPES.
N_PHENOTYPES = 1000

# Collect the per-phenotype frames and concatenate once after the loop.
# Growing a DataFrame with `pl.concat` inside the loop can re-copy the
# accumulated rows on every iteration and depends on concatenating an empty,
# schema-less frame with a typed one.
merged_frames = []

for i in tqdm.tnrange(1, N_PHENOTYPES + 1):
    random_path = root.joinpath(f"plink.R{i}.glm.linear.zst")
    predicted_path = root.joinpath(f"plink.PR{i}.glm.linear.zst")
    # Fail fast, naming the missing file, before attempting to parse anything.
    assert random_path.exists(), random_path
    assert predicted_path.exists(), predicted_path

    # Only the variant identifier and p-value columns are needed.
    random_df = pl.read_csv(random_path, separator="\t", columns=["ID", "P"])
    predicted_df = pl.read_csv(predicted_path, separator="\t", columns=["ID", "P"])
    merged_df = (
        random_df
        # Default (inner) join: keeps variants present in both files; the
        # right-hand "P" column becomes "P_predicted".
        .join(predicted_df, on=["ID"], suffix="_predicted")
        .with_columns(pl.lit(i).alias("phenotype_id"))
    )
    merged_frames.append(merged_df)

# Single vertical concatenation, then rename/reorder to the final schema.
full_gwas_df = (
    pl.concat(merged_frames)
    .select("phenotype_id", pl.col("ID").alias("variant_id"), pl.col("P").alias("P_true"), "P_predicted")
)

full_gwas_df.write_parquet("data/full_gwas.parquet")

print(full_gwas_df.shape)

full_gwas_df.head(2)

  0%|          | 0/1000 [00:00<?, ?it/s]

(100000000, 4)


phenotype_id,variant_id,P_true,P_predicted
i32,str,f64,f64
1,"""1:760912""",0.1454,0.173682
1,"""1:853954""",0.0505077,0.0477667
