In [1]:
import numpy as np
import polars as pl
import pandas as pd
import tqdm.notebook as tqdm
import pathlib
import statsmodels.api as sm
import subprocess
import shlex

In [2]:
# Create the output folders up front so the later cells can write into them.
# Only the first call passes parents=True: it also creates "data", which the
# second call then finds already in place.
pheno_dir = pathlib.Path("data/pheno")
pheno_dir.mkdir(parents=True, exist_ok=True)
gwas_dir = pathlib.Path("data/gwas-noise")
gwas_dir.mkdir(exist_ok=True)

# Generate and format all phenotypes

In [None]:
# Read the original phenotype table (tab-separated, with FID and IID columns
# next to one column per feature).
pheno_df = pl.read_csv("data/pheno/original.tsv", separator="\t")
print("Loaded existing data")

N = pheno_df.height
n_per = 2  # number of noisy copies generated for each original feature

features = sorted(pheno_df.drop(["FID", "IID"]).columns)
n_features = len(features)

# Fixed seed so the generated noise is the same on every run.
np.random.seed(0)

# Build one polars expression per noisy copy. For every copy, the noise standard
# deviation is the feature's own standard deviation scaled by a Beta(2, 1) draw.
# The draw order (beta, then normal) is kept per copy so the random stream is
# consumed in a fixed sequence.
phenotype_definitions = []
for feature in tqdm.tqdm(features):
    feature_std = pheno_df[feature].std()
    for i in range(1, n_per + 1):
        noise_std = feature_std * np.random.beta(a=2, b=1)
        gaussian_noise = np.random.normal(scale=noise_std, size=N)
        phenotype_definitions.append(
            (pl.col(feature) + gaussian_noise).alias(f"{feature}_{i}")
        )

# Keep the identifiers, the original features, and then all noisy copies.
output_columns = ["FID", "IID", *features, *phenotype_definitions]
pheno_df = pheno_df.lazy().select(output_columns).collect()

print("Saving phenotypes")

pheno_df.write_csv("data/pheno/noise_pheno.tsv", separator="\t")

# Show only the schema (zero rows) as the cell output.
pheno_df.head(0)

# Gather phenotype fits

In [79]:
def rsq(y, yhat):
    """Build an expression for the coefficient of determination of `yhat` vs `y`.

    Computes ``1 - RSS / TSS`` where RSS is the sum of squared differences
    between `y` and `yhat`, and TSS is the sum of squared deviations of `y`
    from its own mean. Both arguments are used through ``-``, ``.pow``,
    ``.sum`` and ``.mean`` (called below with polars column expressions).

    A TSS of exactly 0.0 is replaced by 1.0 before dividing, so a constant `y`
    yields ``1 - RSS`` instead of a division by zero.
    """
    residual_ss = (y - yhat).pow(2).sum()
    total_ss = (y - y.mean()).pow(2).sum()
    # Guard the denominator: swap a zero total sum of squares for 1.0.
    safe_total_ss = total_ss.replace(0.0, 1.0)
    return 1.0 - residual_ss / safe_total_ss

    
# For every noisy phenotype, compute R^2 between the original feature column and
# its noisy copy, then reshape to one row per (underlying feature, copy index).
pheno_fit_df = (
    pheno_df
    .lazy()
    .select([
        rsq(pl.col(feature), pl.col(f"{feature}_{i}")).alias(f"{feature}_{i}")
        for feature in features for i in range(1, n_per + 1)
    ])
    # Wide (one column per noisy phenotype) -> long ("variable", "rsq").
    .unpivot(value_name="rsq")
    .with_columns(
        # Leading code of the column name: one capital letter plus two digits.
        pl.col("variable").str.extract("^([A-Z][0-9]{2})").alias("underlying"),
        # Trailing copy index. `[0-9]+` accepts multi-digit indices; a single
        # `[0-9]` fails to match suffixes such as "_10", which fill_null(0)
        # would then silently turn into index 0.
        pl.col("variable").str.extract("_([0-9]+)$").cast(pl.Int64).fill_null(0).alias("i"),
    )
    .select("underlying", "i", "rsq")
    .collect()
)

pheno_fit_df.write_parquet("data/noise_pheno_fit.parquet")

pheno_fit_df.head(2)

underlying,i,rsq
str,i64,f64
"""A04""",1,0.249769
"""A04""",2,0.073273


# Run GWAS

In [83]:
# Run plink2 --glm over every phenotype column of the noisy phenotype file.
# The trailing backslashes are line continuations inside the (non-raw) string,
# so the command is effectively a single line before shlex tokenizes it.
# `--glm zs` produces the compressed result files that the gather cell below
# reads back through the "result*.zst" glob.
command = """
plink2 \
    --pfile ../../data/geno/ukb_wb_subsampled \
    --pheno data/pheno/noise_pheno.tsv \
    --no-input-missing-phenotype \
    --glm zs hide-covar allow-no-covars \
    --threads 100 \
    --out data/gwas-noise/result
"""
# check=True turns a non-zero plink2 exit status into CalledProcessError, so a
# failed GWAS run stops the notebook here instead of letting later cells read
# stale or missing result files.
subprocess.run(shlex.split(command), check=True)

PLINK v2.0.0-a.6.0LM AVX2 Intel (11 Nov 2024)      cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to data/gwas-noise/result.log.
Options in effect:
  --glm zs hide-covar allow-no-covars
  --no-input-missing-phenotype
  --out data/gwas-noise/result
  --pfile ../../data/geno/ukb_wb_subsampled
  --pheno data/pheno/noise_pheno.tsv
  --threads 100

Start time: Tue Nov 26 09:40:31 2024
1031943 MiB RAM detected, ~784971 available; reserving 515971 MiB for main
workspace.
Using up to 100 threads (change this with --threads).
429954 samples (232741 females, 197213 males; 429954 founders) loaded from
../../data/geno/ukb_wb_subsampled.psam.
10000 variants loaded from ../../data/geno/ukb_wb_subsampled.pvar.
2241 quantitative phenotypes loaded.
Calculating allele frequencies... done.
--glm linear regression on quantitative phenotypes #1-240: done.
--glm linear regression on quantitative phenotypes #241-480: done.
--glm linear regre

CompletedProcess(args=['plink2', '--pfile', '../../data/geno/ukb_wb_subsampled', '--pheno', 'data/pheno/noise_pheno.tsv', '--no-input-missing-phenotype', '--glm', 'zs', 'hide-covar', 'allow-no-covars', '--threads', '100', '--out', 'data/gwas-noise/result'], returncode=0)

# Gather GWAS

In [84]:
# Lazily scan every per-phenotype plink2 result file; "path" records which file
# each row came from.
glm_results = pl.scan_csv(
    "data/gwas-noise/result*.zst",
    separator="\t",
    glob=True,
    include_file_paths="path",
)

# Trim the fixed prefix and suffix of the file path so that only the part in
# between (the phenotype name) is left. The column keeps the name "path".
phenotype_name = (
    pl.col("path")
    .str.strip_prefix("data/gwas-noise/result.")
    .str.strip_suffix(".glm.linear.zst")
)

# Squared ratio of effect estimate to its standard error.
chisq = (pl.col("BETA") / pl.col("SE")).pow(2).alias("CHISQ")

# Stream the combined table straight to a single parquet file.
glm_results.select(phenotype_name, "ID", "BETA", "SE", chisq).sink_parquet(
    "data/gwas-noise/noise_results.parquet"
)

In [85]:
# Compare GWAS results of each noisy phenotype against its original feature:
# R^2 between the two CHISQ vectors (one value per variant ID).
gwas_fit_df = (
    pl.scan_parquet("data/gwas-noise/noise_results.parquet")
    .select("path", "ID", "CHISQ")
    # pivot is only available on an eager frame, hence collect() -> pivot -> lazy().
    .collect()
    # Long -> wide: one row per variant ID, one CHISQ column per phenotype.
    .pivot(on="path", values="CHISQ", index="ID")
    .lazy()
    .select([
        rsq(pl.col(feature), pl.col(f"{feature}_{i}")).alias(f"{feature}_{i}")
        for feature in features for i in range(1, n_per + 1)
    ])
    # Wide (one column per noisy phenotype) -> long ("variable", "rsq").
    .unpivot(value_name="rsq")
    .with_columns(
        # Leading code of the column name: one capital letter plus two digits.
        pl.col("variable").str.extract("^([A-Z][0-9]{2})").alias("underlying"),
        # Trailing copy index. `[0-9]+` accepts multi-digit indices; a single
        # `[0-9]` fails to match suffixes such as "_10", which fill_null(0)
        # would then silently turn into index 0.
        pl.col("variable").str.extract("_([0-9]+)$").cast(pl.Int64).fill_null(0).alias("i"),
    )
    .select("underlying", "i", "rsq")
    .collect()
)

gwas_fit_df.write_parquet("data/noise_gwas_fit.parquet")

gwas_fit_df.head(2)

underlying,i,rsq
str,i64,f64
"""A04""",1,0.156914
"""A04""",2,0.060265
