In [1]:
import numpy as np
import polars as pl
import pandas as pd
import tqdm.notebook as tqdm
import pathlib
import statsmodels.api as sm
import subprocess
import shlex

In [2]:
pathlib.Path("data/pheno").mkdir(exist_ok=True, parents=True)
pathlib.Path("data/gwas").mkdir(exist_ok=True)

# Generate and format all phenotypes

In [3]:
pheno_df = pl.scan_csv("data/pheno/original.tsv", separator="\t")

features = pheno_df.drop(["FID", "IID"]).collect_schema().names()
pheno_df = pheno_df.with_columns(pl.col(features).sub(2))

k_to_anon_pheno_df = {
    int(path.stem.replace("anon_", "")): (
        pl.scan_csv(path, separator="\t")
        .drop("n_occurrences")
        .select(pl.all().sub(2))
    )
    for path in pathlib.Path("data/pheno/").glob("anon*tsv")
}
k_to_anon_pheno_df[1] = (
    pl.scan_csv("data/pheno/original.tsv", separator="\t")
    .drop(["FID", "IID"])
    .select(pl.all().sub(2))
)

print("Loaded existing data")

n_per = 100

np.random.seed(0)

# Generate AND (MIN) and OR (MAX) random phenotypes
for i in range(n_per):
    a, b = np.random.choice(features, 2, replace=False).tolist()
    pheno_df = (
        pheno_df
        .with_columns(pl.min_horizontal(pl.col(a, b)).alias(f"and_{a}_{b}"))
    )
    for k, anon_pheno_df in k_to_anon_pheno_df.items():
        k_to_anon_pheno_df[k] = (
            anon_pheno_df
            .with_columns(
                pl.min_horizontal(pl.col(a, b)).alias(f"and_{a}_{b}")
            )
        )
            
print("Generated all AND phenotypes")
    
for i in range(n_per):
    a, b = np.random.choice(features, 2, replace=False).tolist()
    pheno_df = (
        pheno_df
        .with_columns(pl.max_horizontal(pl.col(a, b)).alias(f"or_{a}_{b}"))
    )
    for k, anon_pheno_df in k_to_anon_pheno_df.items():
        k_to_anon_pheno_df[k] = (
            anon_pheno_df
            .with_columns(
                pl.max_horizontal(pl.col(a, b)).alias(f"or_{a}_{b}")
            )
        )

print("Generated all OR phenotypes")
    
for i in range(n_per):
    a, b = np.random.choice(features, 2, replace=False).tolist()
    pheno_df = (
        pheno_df
        .with_columns((pl.col(a) * pl.col(b)).alias(f"mul_{a}_{b}"))
    )
    for k, anon_pheno_df in k_to_anon_pheno_df.items():
        k_to_anon_pheno_df[k] = (
            anon_pheno_df
            .with_columns((pl.col(a) * pl.col(b)).alias(f"mul_{a}_{b}"))
        )

print("Generated all MUL phenotypes")

pheno_df = pheno_df.collect()

print("Collected main phenotypes")

X = pheno_df.select("^[A-Z][0-9]{2}$", const=1.0).to_pandas()

for k, anon_df in tqdm.tqdm(k_to_anon_pheno_df.items()):
    anon_df = anon_df.collect()
    X_anon = anon_df.select("^[A-Z][0-9]{2}$", const=1.0).to_pandas()
    assert sorted(X_anon.columns) == sorted(X.columns)
    
    Y_anon = anon_df.select("^(and|or|mul)_.+$").to_pandas()
    beta = np.linalg.lstsq(X_anon, Y_anon)[0]
    beta_df = pd.DataFrame(beta, index=X_anon.columns, columns=Y_anon.columns)
    this_pheno_df = (
        (X @ beta_df)
        .pipe(pl.DataFrame) 
        .select(pl.all().name.prefix(f"anon_{k:03}_"))
    )
    pheno_df = pl.concat([pheno_df, this_pheno_df], how="horizontal")
    
pheno_df = (
    pheno_df
    .select("FID", "IID", pl.all().exclude("FID", "IID").add(2))
)

print("Saving phenotypes")

pheno_df.write_csv("data/pheno/full_pheno.tsv", separator="\t")

pheno_df.head(0)

Loaded existing data
Generated all AND phenotypes
Generated all OR phenotypes
Generated all MUL phenotypes
Collected main phenotypes


  0%|          | 0/7 [00:00<?, ?it/s]

Saving phenotypes


FID,IID,A04,A08,A09,A15,A37,A38,A41,A49,A63,B00,B01,B02,B05,B06,B07,B08,B15,B19,B26,B27,B34,B35,B36,B37,B86,B95,B96,B97,B98,B99,C15,C16,C18,C19,C20,…,anon_001_mul_G44_R91,anon_001_mul_I22_J06,anon_001_mul_W10_E55,anon_001_mul_R54_B37,anon_001_mul_L24_M05,anon_001_mul_D63_J96,anon_001_mul_E11_R68,anon_001_mul_R30_H33,anon_001_mul_A37_F34,anon_001_mul_D23_T79,anon_001_mul_D23_E06,anon_001_mul_H36_Z43,anon_001_mul_N60_Y42,anon_001_mul_R21_L81,anon_001_mul_Z11_O00,anon_001_mul_N03_E16,anon_001_mul_I24_I89,anon_001_mul_M62_L01,anon_001_mul_J02_O48,anon_001_mul_N12_T90,anon_001_mul_K29_Z42,anon_001_mul_W45_G91,anon_001_mul_M13_B01,anon_001_mul_I83_Z57,anon_001_mul_A41_D33,anon_001_mul_S63_R17,anon_001_mul_C82_N99,anon_001_mul_S27_D64,anon_001_mul_M53_I72,anon_001_mul_Z04_N10,anon_001_mul_H53_J13,anon_001_mul_M31_D51,anon_001_mul_H26_M96,anon_001_mul_Z74_Q21,anon_001_mul_T90_D48,anon_001_mul_N48_M86,anon_001_mul_R22_W29
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64


# Run GWAS

In [4]:
command = """
plink2 \
    --pfile ../../data/geno/ukb_wb_subsampled \
    --pheno data/pheno/full_pheno.tsv \
    --glm zs hide-covar allow-no-covars \
    --threads 100 \
    --out data/gwas/result
"""
subprocess.run(shlex.split(command))

PLINK v2.0.0-a.6.0LM AVX2 Intel (11 Nov 2024)      cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to data/gwas/result.log.
Options in effect:
  --glm zs hide-covar allow-no-covars
  --out data/gwas/result
  --pfile ../../data/geno/ukb_wb_subsampled
  --pheno data/pheno/full_pheno.tsv
  --threads 100

Start time: Mon Nov 25 16:42:02 2024
1031943 MiB RAM detected, ~964752 available; reserving 515971 MiB for main
workspace.
Using up to 100 threads (change this with --threads).
429954 samples (232741 females, 197213 males; 429954 founders) loaded from
../../data/geno/ukb_wb_subsampled.psam.
10000 variants loaded from ../../data/geno/ukb_wb_subsampled.pvar.
3147 phenotypes loaded (4 binary, 3143 quantitative).
Calculating allele frequencies... done.
--glm linear regression on quantitative phenotypes #1-240: done.
--glm linear regression on quantitative phenotypes #241-480: done.
--glm linear regression on quantitative pheno

Error: All samples for --glm phenotype 'and_H15_F50' are cases.


End time: Mon Nov 25 16:58:07 2024


CompletedProcess(args=['plink2', '--pfile', '../../data/geno/ukb_wb_subsampled', '--pheno', 'data/pheno/full_pheno.tsv', '--glm', 'zs', 'hide-covar', 'allow-no-covars', '--threads', '100', '--out', 'data/gwas/result'], returncode=7)

# Gather GWAS

In [5]:
(
    pl.scan_csv("data/gwas/result*.zst", separator="\t", glob=True, include_file_paths="path")
    .select(
        pl.col("path").str.strip_prefix("data/gwas/result.").str.strip_suffix(".glm.linear.zst"),
        "ID", "BETA", "SE",
        (pl.col("BETA") / pl.col("SE")).pow(2).alias("CHISQ"),
    )
    .sink_parquet("data/gwas/full_results.parquet")
)