In [12]:
import concurrent.futures
import pathlib
import shlex
import subprocess

import tqdm.notebook as tqdm

# GWAS

In [3]:
# Prefix of the genotype file set handed to `plink2 --pfile` below. It is the
# host-specific data root followed by the dataset path; swap which root line
# is commented out to run on the other host.
geno_path = "".join(
    [
        # "/data1/home/mnz2108/",  # mimir
        "/data2/michael/",  # eir
        "data_resources/ukbiobank/hapmap3_genotypes/hapmap3_variants_white_british",
    ]
)

In [5]:
# Run `plink2 --glm` on the phenotypes in top20.tsv with the covariates in
# covar.tsv; output files are written with the prefix data/gwas/plink.
# NOTE(review): --pheno is relative to the working directory while --covar
# climbs two levels (../../data/...); both loaded in the recorded run, but
# confirm that the two different roots are intentional.
# NOTE(review): --threads 55 is tuned to the machine this was run on.
# geno_path is quoted so that a path containing whitespace still reaches
# plink2 as a single argument after shlex.split.
command = f"""
plink2 \
    --pfile {shlex.quote(geno_path)} \
    --pheno data/pheno/top20.tsv \
    --covar ../../data/pheno/covar.tsv \
    --glm hide-covar \
    --threads 55 \
    --out data/gwas/plink
"""

# check=True: stop here if plink2 exits non-zero, so that a "Run All" does not
# carry on and feed stale or missing GWAS output into the later cells.
result = subprocess.run(shlex.split(command), check=True)

PLINK v2.00a6LM 64-bit Intel (18 Mar 2024)     www.cog-genomics.org/plink/2.0/
(C) 2005-2024 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to data/gwas/plink.log.
Options in effect:
  --covar ../../data/pheno/covar.tsv
  --glm hide-covar
  --out data/gwas/plink
  --pfile /data2/michael/data_resources/ukbiobank/hapmap3_genotypes/hapmap3_variants_white_british
  --pheno data/pheno/top20.tsv
  --threads 55

Start time: Thu Apr 11 10:14:56 2024
193175 MiB RAM detected, ~160491 available; reserving 96587 MiB for main
workspace.
Using up to 55 threads (change this with --threads).
429954 samples (232741 females, 197213 males; 429954 founders) loaded from
/data2/michael/data_resources/ukbiobank/hapmap3_genotypes/hapmap3_variants_white_british.psam.
1166145 variants loaded from
/data2/michael/data_resources/ukbiobank/hapmap3_genotypes/hapmap3_variants_white_british.pvar.
20 quantitative phenotypes loaded.
12 covariates loaded from ../../data/pheno/covar.tsv.
Calculat

# Format

In [13]:
# Reformat all the summary statistics for LDAK
# The command is a raw string because "\;" is not a valid Python escape
# sequence; in a plain literal it only works by accident and triggers an
# invalid-escape warning. The glob is quoted so the text can also be pasted
# into a shell unchanged. shlex.split strips the quotes and the backslash, so
# find receives exactly the same argument list as before.
command = r"""
find data/gwas/ -type f -name '*.linear' -exec sumher_rs fmt -g {} -o {}.summaries \;
"""
# check=True surfaces failures of find itself (e.g. data/gwas/ missing).
# NOTE(review): with `-exec ... ;` a failing sumher_rs call is not expected to
# change find's exit status, so still check for one "Success!" per input file.
subprocess.run(shlex.split(command), check=True)

Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!
Success!


CompletedProcess(args=['find', 'data/gwas/', '-type', 'f', '-name', '*.linear', '-exec', 'sumher_rs', 'fmt', '-g', '{}', '-o', '{}.summaries', ';'], returncode=0)

# Heritability

In [24]:
def make_h2_command(
    path,
    tagfile="/data2/michael/data_resources/ldak/bld.ldak.hapmap.gbr.tagging",
):
    """Build the LDAK ``--sum-hers`` command line for one summary file.

    Parameters
    ----------
    path : pathlib.Path
        Summary-statistics file passed to ``--summary``. The output stem
        reuses its file name inside an ``h2`` directory located next to the
        directory that contains ``path``.
    tagfile : str or path-like, optional
        Tagging file passed to ``--tagfile``. Defaults to the location that
        was previously hard-coded, so existing callers are unaffected.

    Returns
    -------
    str
        A command string meant to be tokenised with ``shlex.split``.
    """
    output_stem = path.parent.parent.joinpath("h2").joinpath(path.name)
    # Quote every interpolated value: without this, a path containing
    # whitespace would be split into several arguments by shlex.split.
    tagfile_arg = shlex.quote(str(tagfile))
    summary_arg = shlex.quote(str(path))
    output_arg = shlex.quote(str(output_stem))
    return f"""
    ldak5.2.linux \
        --tagfile {tagfile_arg} \
        --summary {summary_arg} \
        --check-sums NO \
        --cutoff 0.01 \
        --sum-hers {output_arg}
    """

def run_h2(path):
    """Execute the heritability command for ``path`` and return the finished process.

    stdout and stderr are captured on the returned ``CompletedProcess``; a
    non-zero exit status does not raise, so callers must inspect
    ``returncode`` themselves.
    """
    argv = shlex.split(make_h2_command(path))
    completed = subprocess.run(argv, capture_output=True)
    return completed

In [25]:
# Summary-statistics files sitting directly in data/gwas/, in sorted order.
gwas_results = sorted(pathlib.Path("data/gwas/").glob("*.summaries"))
print(len(gwas_results))

# One external process per file; the worker threads only wait on them.
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = list(tqdm.tqdm(executor.map(run_h2, gwas_results), total=len(gwas_results)))

# run_h2 captures all output and does not raise on a non-zero exit status, so
# failed jobs would otherwise be invisible. Executor.map yields results in
# input order, which is what makes pairing them with gwas_results valid.
h2_failures = [
    (summary_path, proc)
    for summary_path, proc in zip(gwas_results, results)
    if proc.returncode != 0
]
for failed_path, failed_proc in h2_failures:
    print(f"FAILED (exit {failed_proc.returncode}): {failed_path}")
    # Show the tail of whichever stream has content.
    print((failed_proc.stderr or failed_proc.stdout).decode(errors="replace")[-2000:])
print(f"{len(results) - len(h2_failures)}/{len(results)} heritability runs succeeded")

20


  0%|          | 0/20 [00:00<?, ?it/s]

# Genetic correlation

In [32]:
def make_rg_command(
    path1,
    path2,
    tagfile="/data2/michael/data_resources/ldak/ldak.thin.hapmap.gbr.tagging",
):
    """Build the LDAK ``--sum-cors`` command line for a pair of summary files.

    Parameters
    ----------
    path1, path2 : pathlib.Path
        Summary-statistics files passed to ``--summary`` and ``--summary2``.
        The output stem is ``<path1 name>.<path2 name>`` inside an ``rg``
        directory located next to the directory that contains ``path1``.
    tagfile : str or path-like, optional
        Tagging file passed to ``--tagfile``. Defaults to the location that
        was previously hard-coded, so existing callers are unaffected.

    Returns
    -------
    str
        A command string meant to be tokenised with ``shlex.split``.
    """
    output_stem = path1.parent.parent.joinpath("rg").joinpath(path1.name + "." + path2.name)
    # Quote every interpolated value: without this, a path containing
    # whitespace would be split into several arguments by shlex.split.
    tagfile_arg = shlex.quote(str(tagfile))
    summary1_arg = shlex.quote(str(path1))
    summary2_arg = shlex.quote(str(path2))
    output_arg = shlex.quote(str(output_stem))
    return f"""
    ldak5.2.linux \
        --tagfile {tagfile_arg} \
        --summary {summary1_arg} \
        --summary2 {summary2_arg} \
        --check-sums NO \
        --cutoff 0.01 \
        --sum-cors {output_arg}
    """

def run_rg(pair):
    """Execute the genetic-correlation command for one ``(path1, path2)`` pair.

    Takes a single 2-tuple so it can be used directly with ``Executor.map``.
    stdout and stderr are captured on the returned ``CompletedProcess``; a
    non-zero exit status does not raise.
    """
    first, second = pair
    argv = shlex.split(make_rg_command(first, second))
    completed = subprocess.run(argv, capture_output=True)
    return completed

In [33]:
# Every unordered pair of distinct summary files, each listed once (path1 < path2).
pairs = [(path1, path2) for path1 in gwas_results for path2 in gwas_results if path1 < path2]
print(len(pairs))

# One external process per pair; the worker threads only wait on them.
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = list(tqdm.tqdm(executor.map(run_rg, pairs), total=len(pairs)))

# run_rg captures all output and does not raise on a non-zero exit status, so
# failed jobs would otherwise be invisible. Executor.map yields results in
# input order, which is what makes pairing them with `pairs` valid.
rg_failures = [
    (pair, proc)
    for pair, proc in zip(pairs, results)
    if proc.returncode != 0
]
for failed_pair, failed_proc in rg_failures:
    print(f"FAILED (exit {failed_proc.returncode}): {failed_pair[0]} vs {failed_pair[1]}")
    # Show the tail of whichever stream has content.
    print((failed_proc.stderr or failed_proc.stdout).decode(errors="replace")[-2000:])
print(f"{len(results) - len(rg_failures)}/{len(results)} genetic-correlation runs succeeded")

190


  0%|          | 0/190 [00:00<?, ?it/s]