In [1]:
import pathlib
import shlex
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import seaborn as sns
import sklearn.linear_model
import tqdm.notebook as tqdm

In [2]:
pathlib.Path("data/gwas").mkdir(exist_ok=True, parents=True)
pathlib.Path("data/gwas/feature").mkdir(exist_ok=True, parents=True)
pathlib.Path("data/gwas/direct").mkdir(exist_ok=True, parents=True)
pathlib.Path("data/indirect-betas").mkdir(exist_ok=True, parents=True)

In [3]:
(
    pl.concat([
            pl.scan_csv("data/pheno/original.tsv", separator="\t").select("FID", "IID"),
            pl.scan_csv("data/pheno/rand_original.tsv", separator="\t"),
        ], how="horizontal"
    )
    .collect()
    .write_csv("data/pheno/rand_original_gwas.tsv", separator="\t")
)

In [4]:
command = """
plink2 \
  --pfile ../../data/geno/hapmap3_variants_white_british \
  --thin-count 10000 \
  --make-pgen \
  --out ../../data/geno/ukb_wb_subsampled
"""
result = subprocess.run(shlex.split(command))

PLINK v2.00a6 M1 (18 Aug 2024)                 www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to ../../data/geno/ukb_wb_subsampled.log.
Options in effect:
  --make-pgen
  --out ../../data/geno/ukb_wb_subsampled
  --pfile ../../data/geno/hapmap3_variants_white_british
  --thin-count 10000

Start time: Fri Nov 15 16:32:04 2024
65536 MiB RAM detected; reserving 32768 MiB for main workspace.
Using up to 16 threads (change this with --threads).
429954 samples (232741 females, 197213 males; 429954 founders) loaded from
../../data/geno/hapmap3_variants_white_british.psam.
1166145 variants loaded from
../../data/geno/hapmap3_variants_white_british.pvar.
Note: No phenotype data present.
--thin-count: 1156145 variants removed (10000 remaining).
10000 variants remaining after main filters.
Writing ../../data/geno/ukb_wb_subsampled.psam ... done.
Writing ../../data/geno/ukb_wb_subsampled.pvar ... done.
Writing ../../data/geno/

In [5]:
command = """
plink2 \
  --pfile ../../data/geno/ukb_wb_subsampled \
  --pheno data/pheno/original.tsv \
  --glm allow-no-covars hide-covar zs \
  --out data/gwas/feature/result
"""
result = subprocess.run(shlex.split(command))

PLINK v2.00a6 M1 (18 Aug 2024)                 www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to data/gwas/feature/result.log.
Options in effect:
  --glm allow-no-covars hide-covar zs
  --out data/gwas/feature/result
  --pfile ../../data/geno/ukb_wb_subsampled
  --pheno data/pheno/original.tsv

Start time: Fri Nov 15 16:32:09 2024
65536 MiB RAM detected; reserving 32768 MiB for main workspace.
Using up to 16 threads (change this with --threads).
429954 samples (232741 females, 197213 males; 429954 founders) loaded from
../../data/geno/ukb_wb_subsampled.psam.
10000 variants loaded from ../../data/geno/ukb_wb_subsampled.pvar.
500 phenotypes loaded (16 binary, 484 quantitative).
Calculating allele frequencies... done.
--glm linear regression on quantitative phenotypes #1-240: done.
--glm linear regression on quantitative phenotypes #241-480: done.
--glm linear regression on quantitative phenotypes #481-484: done.
Resu

Error: All samples for --glm phenotype 'q_5074_4' are controls.


In [6]:
command = """
plink2 \
  --pfile ../../data/geno/ukb_wb_subsampled \
  --pheno data/pheno/rand_original_gwas.tsv \
  --glm allow-no-covars hide-covar zs \
  --no-input-missing-phenotype \
  --out data/gwas/direct/result
"""
result = subprocess.run(shlex.split(command))

PLINK v2.00a6 M1 (18 Aug 2024)                 www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to data/gwas/direct/result.log.
Options in effect:
  --glm allow-no-covars hide-covar zs
  --no-input-missing-phenotype
  --out data/gwas/direct/result
  --pfile ../../data/geno/ukb_wb_subsampled
  --pheno data/pheno/rand_original_gwas.tsv

Start time: Fri Nov 15 16:32:49 2024
65536 MiB RAM detected; reserving 32768 MiB for main workspace.
Using up to 16 threads (change this with --threads).
429954 samples (232741 females, 197213 males; 429954 founders) loaded from
../../data/geno/ukb_wb_subsampled.psam.
10000 variants loaded from ../../data/geno/ukb_wb_subsampled.pvar.
1000 phenotypes loaded (41 binary, 959 quantitative).
Calculating allele frequencies... done.
--glm linear regression on quantitative phenotypes #1-240: done.
--glm linear regression on quantitative phenotypes #241-480: done.
--glm linear regression on quan

Error: All samples for --glm phenotype 'mul_q_5074_7_q_3761_0' are controls.
