In [1]:
import shlex
import subprocess
from pathlib import Path

import pandas as pd
import polars as pl

In [25]:
# Build the phenotype table passed to SAIGE: the sample ID, the two simulated
# traits, and every confounder column, with the confounders renamed to the
# short names c1..c5 that the step-1 cells list in --covarColList.
pheno_df = (
    pl.read_csv("data/simulated-phenotypes/Ysim_saige.txt", separator="\t")
    # The regex selector keeps every column whose name contains "Confounder".
    .select("IID", "feature_01", "feature_02", pl.col("^.+Confounder.+$"))
    .rename({
        "sharedConfounder_norm1": "c1",
        "sharedConfounder_norm2": "c2",
        "sharedConfounder_norm3": "c3",
        "independentConfounder_norm1": "c4",
        "independentConfounder_norm2": "c5"
    })
)

# Make sure the output directory exists before writing: on a fresh checkout
# debug/ may be missing, and the write below would fail instead of creating it.
Path("debug").mkdir(parents=True, exist_ok=True)
pheno_df.write_csv("debug/pheno.tsv", separator="\t")
pheno_df.head(2)

IID,feature_01,feature_02,c1,c2,c3,c4,c5
str,f64,f64,f64,f64,f64,f64,f64
"""ID_1""",-1.946783,-1.207065,1.079511,0.675817,0.335099,1.812403,-0.383385
"""ID_2""",-2.037948,0.431467,2.308693,-0.211148,-1.933398,1.139712,0.087484


In [32]:
# Quick sanity check on the two traits before model fitting: one row holding
# the mean and the variance of each, suffixed "_mean" / "_var".
trait_cols = pl.col("feature_01", "feature_02")
pheno_df.select(
    trait_cols.mean().name.suffix("_mean"),
    trait_cols.var().name.suffix("_var"),
)

feature_01_mean,feature_02_mean,feature_01_var,feature_02_var
f64,f64,f64,f64
3.3926e-15,-1.3619e-14,1.0,1.0


In [33]:
# SAIGE step 1 (step1_fitNULLGLMM.R) for feature_01: fits the null model on
# debug/pheno.tsv with covariates c1-c5 and writes its outputs under the
# debug/feature_01 prefix (picked up by the matching step-2 cell).
# --rm deletes the container when the script exits, so re-running this cell
# does not accumulate stopped containers on the host.
!docker run \
    --rm \
    -v ./data:/data \
    -v ./debug:/debug \
    -w / \
    wzhou88/saige:1.3.0 step1_fitNULLGLMM.R \
    --phenoCol=feature_01 \
    --outputPrefix=debug/feature_01 \
    --plinkFile=data/simulated-phenotypes/Genotypes \
    --useSparseGRMtoFitNULL=FALSE \
    --phenoFile=debug/pheno.tsv \
    --covarColList=c1,c2,c3,c4,c5 \
    --sampleIDColinphenoFile=IID \
    --invNormalize=FALSE \
    --traitType=quantitative \
    --nThreads=16 \
    --IsOverwriteVarianceRatioFile=TRUE

Loading required package: optparse
R version 3.6.3 (2020-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.4 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] optparse_1.7.3 SAIGE_1.3.0   

loaded via a namespace (and not attached):
[1] compiler_3.6.3     Matrix_1.6-1       Rcpp_1.0.7         getopt_1.20.3     
[5] grid_3.6.3         data.table_1.12.8  RcppParallel_5.1.7 lattice_0.20-40   
$plinkFile
[1] "data/simulated-phenotyp

In [34]:
# SAIGE step 1 (step1_fitNULLGLMM.R) for feature_02: fits the null model on
# debug/pheno.tsv with covariates c1-c5 and writes its outputs under the
# debug/feature_02 prefix (picked up by the matching step-2 cell).
# --rm deletes the container when the script exits, so re-running this cell
# does not accumulate stopped containers on the host.
!docker run \
    --rm \
    -v ./data:/data \
    -v ./debug:/debug \
    -w / \
    wzhou88/saige:1.3.0 step1_fitNULLGLMM.R \
    --phenoCol=feature_02 \
    --outputPrefix=debug/feature_02 \
    --plinkFile=data/simulated-phenotypes/Genotypes \
    --useSparseGRMtoFitNULL=FALSE \
    --phenoFile=debug/pheno.tsv \
    --covarColList=c1,c2,c3,c4,c5 \
    --sampleIDColinphenoFile=IID \
    --invNormalize=FALSE \
    --traitType=quantitative \
    --nThreads=16 \
    --IsOverwriteVarianceRatioFile=TRUE

Loading required package: optparse
R version 3.6.3 (2020-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.4 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] optparse_1.7.3 SAIGE_1.3.0   

loaded via a namespace (and not attached):
[1] compiler_3.6.3     Matrix_1.6-1       Rcpp_1.0.7         getopt_1.20.3     
[5] grid_3.6.3         data.table_1.12.8  RcppParallel_5.1.7 lattice_0.20-40   
$plinkFile
[1] "data/simulated-phenotyp

In [35]:
# SAIGE step 2 (step2_SPAtests.R) for feature_01: runs the tests on the PLINK
# bed/bim/fam genotypes using the step-1 model (debug/feature_01.rda) and
# variance-ratio file, writing results to debug/feature_01.txt.
# --rm deletes the container when the script exits, so re-running this cell
# does not accumulate stopped containers on the host.
!docker run \
    --rm \
    -v ./data:/data \
    -v ./debug:/debug \
    -w / \
    wzhou88/saige:1.3.0 step2_SPAtests.R \
    --GMMATmodelFile=debug/feature_01.rda \
    --varianceRatioFile=debug/feature_01.varianceRatio.txt \
    --SAIGEOutputFile=debug/feature_01.txt \
    --bedFile=data/simulated-phenotypes/Genotypes.bed \
    --bimFile=data/simulated-phenotypes/Genotypes.bim \
    --famFile=data/simulated-phenotypes/Genotypes.fam \
    --AlleleOrder=ref-first \
    --is_output_moreDetails=TRUE \
    --LOCO=FALSE

Loading required package: RhpcBLASctl
R version 3.6.3 (2020-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.4 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] data.table_1.12.8   optparse_1.7.3      RhpcBLASctl_0.23-42
[4] SAIGE_1.3.0        

loaded via a namespace (and not attached):
[1] compiler_3.6.3     Matrix_1.6-1       Rcpp_1.0.7         getopt_1.20.3     
[5] grid_3.6.3         RcppParallel_5.1.7 lattice_0.20-40   
$

In [36]:
# SAIGE step 2 (step2_SPAtests.R) for feature_02: runs the tests on the PLINK
# bed/bim/fam genotypes using the step-1 model (debug/feature_02.rda) and
# variance-ratio file, writing results to debug/feature_02.txt.
# --rm deletes the container when the script exits, so re-running this cell
# does not accumulate stopped containers on the host.
!docker run \
    --rm \
    -v ./data:/data \
    -v ./debug:/debug \
    -w / \
    wzhou88/saige:1.3.0 step2_SPAtests.R \
    --GMMATmodelFile=debug/feature_02.rda \
    --varianceRatioFile=debug/feature_02.varianceRatio.txt \
    --SAIGEOutputFile=debug/feature_02.txt \
    --bedFile=data/simulated-phenotypes/Genotypes.bed \
    --bimFile=data/simulated-phenotypes/Genotypes.bim \
    --famFile=data/simulated-phenotypes/Genotypes.fam \
    --AlleleOrder=ref-first \
    --is_output_moreDetails=TRUE \
    --LOCO=FALSE

Loading required package: RhpcBLASctl
R version 3.6.3 (2020-02-29)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.4 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
 [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
 [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
[10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] data.table_1.12.8   optparse_1.7.3      RhpcBLASctl_0.23-42
[4] SAIGE_1.3.0        

loaded via a namespace (and not attached):
[1] compiler_3.6.3     Matrix_1.6-1       Rcpp_1.0.7         getopt_1.20.3     
[5] grid_3.6.3         RcppParallel_5.1.7 lattice_0.20-40   
$