In [1]:
import pathlib

import pandas as pd
import polars as pl

# OLS univariate

In [2]:
direct_univariate_paths = list(pathlib.Path("data/direct/ols_univariate").glob("plink.PROJ*.glm.linear.zst"))

direct_univariate_df = (
    pd.concat([
        pd.read_csv(path, sep="\s+", usecols=["ID", "BETA", "SE", "T_STAT", "P", "OBS_CT"])
            .assign(phenotype_id=path.stem.replace("plink.", "").replace(".glm.linear", ""))
        for path in direct_univariate_paths
    ])
    .pipe(pl.DataFrame)
    .select("phenotype_id", variant_id="ID", beta="BETA", std_error="SE", 
            chisq=pl.col("T_STAT").pow(2), log10p=pl.col("P").log(10).mul(-1), sample_size="OBS_CT")
)

direct_univariate_df.head(2)

phenotype_id,variant_id,beta,std_error,chisq,log10p,sample_size
str,str,f64,f64,f64,f64,i64
"""PROJ002""","""1:761147""",-0.010403,0.0154167,0.455366,0.301204,100000
"""PROJ002""","""1:768448""",0.0180045,0.0179621,1.004726,0.500077,100000


In [3]:
indirect_univariate_df = (
    pl.read_csv("data/igwas/univariate.tsv.zst", separator="\t")
    .select("phenotype_id", "variant_id", "beta", "std_error", 
            pl.col("t_stat").pow(2).alias("chisq"), pl.col("p_value").alias("log10p"), 
            "sample_size")
)

indirect_univariate_df.head(2)

phenotype_id,variant_id,beta,std_error,chisq,log10p,sample_size
str,str,f64,f64,f64,f64,i64
"""PROJ001""","""1:761147""",0.027324,0.024054,1.29039,0.591796,100000
"""PROJ001""","""1:768448""",-0.029101,0.028025,1.078261,0.5241989,100000


In [4]:
univariate_comparison_df = (
    direct_univariate_df
    .join(indirect_univariate_df, on=["phenotype_id", "variant_id"], suffix="_indirect")
)

univariate_comparison_df.head(2)

phenotype_id,variant_id,beta,std_error,chisq,log10p,sample_size,beta_indirect,std_error_indirect,chisq_indirect,log10p_indirect,sample_size_indirect
str,str,f64,f64,f64,f64,i64,f64,f64,f64,f64,i64
"""PROJ001""","""1:761147""",0.0272141,0.0240502,1.280428,0.588677,99999,0.027324,0.024054,1.29039,0.591796,100000
"""PROJ001""","""1:768448""",-0.029204,0.0280211,1.086181,0.526773,99999,-0.029101,0.028025,1.078261,0.5241989,100000


In [5]:
univariate_comparison_df.write_parquet("plot_data/univariate.parquet")

# OLS multivariate

In [6]:
direct_multivariate_paths = list(pathlib.Path("data/direct/ols_multivariate").glob("plink.PROJ*.glm.linear.zst"))

direct_multivariate_df = (
    pd.concat([
        pd.read_csv(path, sep="\s+", usecols=["ID", "BETA", "SE", "T_STAT", "P", "OBS_CT"])
            .assign(phenotype_id=path.stem.replace("plink.", "").replace(".glm.linear", ""))
        for path in direct_multivariate_paths
    ])
    .pipe(pl.DataFrame)
    .select("phenotype_id", variant_id="ID", beta="BETA", std_error="SE", 
            chisq=pl.col("T_STAT").pow(2), log10p=pl.col("P").log(10).mul(-1), sample_size="OBS_CT")
)

direct_multivariate_df.head(2)

phenotype_id,variant_id,beta,std_error,chisq,log10p,sample_size
str,str,f64,f64,f64,f64,i64
"""PROJ002""","""1:761147""",-0.01085,0.0152572,0.505731,0.321487,100000
"""PROJ002""","""1:768448""",0.0213846,0.0177671,1.448677,0.640654,100000


In [7]:
indirect_multivariate_df = (
    pl.read_csv("data/igwas/multivariate.tsv.zst", separator="\t")
    .select("phenotype_id", "variant_id", "beta", "std_error", 
            pl.col("t_stat").pow(2).alias("chisq"), pl.col("p_value").alias("log10p"), 
            "sample_size")
)

indirect_multivariate_df.head(2)

phenotype_id,variant_id,beta,std_error,chisq,log10p,sample_size
str,str,f64,f64,f64,f64,i64
"""PROJ001""","""1:761147""",0.019287,0.0234371,0.677203,0.3866302,100000
"""PROJ001""","""1:768448""",-0.020533,0.027293,0.565981,0.344994,100000


In [8]:
multivariate_comparison_df = (
    direct_multivariate_df
    .join(indirect_multivariate_df, on=["phenotype_id", "variant_id"], suffix="_indirect")
)

multivariate_comparison_df.head(2)

phenotype_id,variant_id,beta,std_error,chisq,log10p,sample_size,beta_indirect,std_error_indirect,chisq_indirect,log10p_indirect,sample_size_indirect
str,str,f64,f64,f64,f64,i64,f64,f64,f64,f64,i64
"""PROJ001""","""1:761147""",0.0191819,0.0234841,0.667169,0.382955,99999,0.019287,0.0234371,0.677203,0.3866302,100000
"""PROJ001""","""1:768448""",-0.020624,0.0273474,0.568747,0.346056,99999,-0.020533,0.027293,0.565981,0.344994,100000


In [9]:
multivariate_comparison_df.write_parquet("plot_data/multivariate.parquet")

# Regenie

In [10]:
indirect_regenie_df = (
    pl.read_csv("data/igwas/regenie.tsv.zst", separator="\t")
    .select("phenotype_id", "variant_id", "beta", "std_error", 
            pl.col("t_stat").pow(2).alias("chisq"), pl.col("p_value").alias("log10p"), 
            "sample_size")
)

indirect_regenie_df.head(2)

phenotype_id,variant_id,beta,std_error,chisq,log10p,sample_size
str,str,f64,f64,f64,f64,i64
"""PROJ001""","""1:761147""",-0.026962,0.019455,1.920618,0.780437,100000
"""PROJ001""","""1:768448""",-0.009058,0.022656,0.15983,0.161583,100000


In [11]:
direct_regenie_paths = list(pathlib.Path("data/direct/regenie").glob("direct_PROJ*.regenie.gz"))

direct_regenie_df = (
    pd.concat([
        pd.read_csv(path, sep="\s+", usecols=["ID", "N", "BETA", "SE", "CHISQ", "LOG10P"])
            .assign(phenotype_id=path.stem.replace("direct_", "").replace(".regenie", ""))
        for path in direct_regenie_paths
    ])
    .pipe(pl.DataFrame)
    .select("phenotype_id", variant_id="ID", beta="BETA", std_error="SE", 
            chisq="CHISQ", log10p="LOG10P", sample_size="N")
)

direct_regenie_df.head(2)

phenotype_id,variant_id,beta,std_error,chisq,log10p,sample_size
str,str,f64,f64,f64,f64,i64
"""PROJ001""","""1:761147""",-0.02854,0.0230651,1.53107,0.665644,100000
"""PROJ001""","""1:768448""",-0.009317,0.0268595,0.120321,0.137459,100000


In [12]:
regenie_comparison_df = (
    direct_regenie_df
    .join(indirect_regenie_df, on=["phenotype_id", "variant_id"], suffix="_indirect")
)

regenie_comparison_df.head(2)

phenotype_id,variant_id,beta,std_error,chisq,log10p,sample_size,beta_indirect,std_error_indirect,chisq_indirect,log10p_indirect,sample_size_indirect
str,str,f64,f64,f64,f64,i64,f64,f64,f64,f64,i64
"""PROJ001""","""1:761147""",-0.02854,0.0230651,1.53107,0.665644,100000,-0.026962,0.019455,1.920618,0.780437,100000
"""PROJ001""","""1:768448""",-0.009317,0.0268595,0.120321,0.137459,100000,-0.009058,0.022656,0.15983,0.161583,100000


In [13]:
regenie_comparison_df.write_parquet("plot_data/regenie.parquet")

# FastGWA

In [14]:
indirect_fastgwa_df = (
    pl.read_csv("data/igwas/fastgwa.tsv.zst", separator="\t")
    .select("phenotype_id", "variant_id", "beta", "std_error", 
            pl.col("t_stat").pow(2).alias("chisq"), pl.col("p_value").alias("log10p"), 
            "sample_size")
)

indirect_fastgwa_df.head(2)

phenotype_id,variant_id,beta,std_error,chisq,log10p,sample_size
str,str,f64,f64,f64,f64,i64
"""PROJ001""","""1:761147""",-0.016902,0.023544,0.515362,0.325297,99194
"""PROJ001""","""1:768448""",-0.020637,0.027213,0.575128,0.3484988,100000


In [15]:
direct_fastgwa_paths = list(pathlib.Path("data/direct/fastgwa").glob("PROJ*.fastGWA"))

direct_fastgwa_df = (
    pl.concat([
        pl.read_csv(path, separator="\t", columns=["SNP", "N", "BETA", "SE", "P"])
        .with_columns(phenotype_id=pl.lit(path.stem.replace(".fastGWA", "")))
        for path in direct_fastgwa_paths
    ])
    .select("phenotype_id", variant_id="SNP", beta="BETA", std_error="SE",
            chisq=(pl.col("BETA")/pl.col("SE")).pow(2), log10p=pl.col("P").log(10), 
            sample_size="N")
)

direct_fastgwa_df.head(2)

phenotype_id,variant_id,beta,std_error,chisq,log10p,sample_size
str,str,f64,f64,f64,f64,i64
"""PROJ001""","""1:761147""",-0.016955,0.0236513,0.513884,-0.324714,99194
"""PROJ001""","""1:768448""",-0.02052,0.0273366,0.563474,-0.344033,100000


In [16]:
fastgwa_comparison_df = (
    direct_fastgwa_df
    .join(indirect_fastgwa_df, on=["phenotype_id", "variant_id"], suffix="_indirect")
)

fastgwa_comparison_df.head(2)

phenotype_id,variant_id,beta,std_error,chisq,log10p,sample_size,beta_indirect,std_error_indirect,chisq_indirect,log10p_indirect,sample_size_indirect
str,str,f64,f64,f64,f64,i64,f64,f64,f64,f64,i64
"""PROJ001""","""1:761147""",-0.016955,0.0236513,0.513884,-0.324714,99194,-0.016902,0.023544,0.515362,0.325297,99194
"""PROJ001""","""1:768448""",-0.02052,0.0273366,0.563474,-0.344033,100000,-0.020637,0.027213,0.575128,0.3484988,100000


In [17]:
fastgwa_comparison_df.write_parquet("plot_data/fastgwa.parquet")