## Sgkit GWAS Workflow Example

This workflow simulates genotype, covariate, and outcome data at a realistic scale and uses it to benchmark large-scale GWAS regressions.

See https://github.com/pystatgen/sgkit/issues/438 for more details.

In [16]:
import os
import numpy as np
import xarray as xr
import dask.array as da
from dask.array import stats
from dask.diagnostics import ProgressBar
from dask.distributed import Client, performance_report
import fsspec
# Filesystem object for the `gs` protocol (the scheme of the gs:// dataset path);
# used later to check whether the simulated dataset already exists.
fs = fsspec.filesystem('gs')

In [17]:
# Start a dask.distributed client with a single worker (on a single machine).
# Comment this line out to run on dask's default, non-distributed scheduler instead;
# in that case also switch the final cell from `performance_report` to `ProgressBar`.
client = Client(n_workers=1)
# Directory that receives the HTML performance report written by the final cell
os.makedirs("reports", exist_ok=True)

In [18]:
# Debug settings (small problem size for quick experiments)
n = 1000 # Number of variants (i.e. genomic locations)
m = 1000 # Number of individuals (i.e. people)
c = 3    # Number of covariates (i.e. confounders)

# Representative settings for single (small) UK Biobank chromosome:
# n, m, c = 141910, 365941, 25

# XY chromosome
# NOTE: this assignment overrides the debug settings above; comment it out
# to fall back to the small debug problem size.
n, m, c = 8444, 365941, 25

# Location of the simulated dataset on cloud storage, keyed by the problem dimensions
path = f"gs://rs-gwas-benchmark/sim_ds_{n}_{m}_{c}.zarr"
path

'gs://rs-gwas-benchmark/sim_ds_8444_365941_25.zarr'

In [19]:
# Create the dataset on cloud storage if not already present
if not fs.exists(path):
    with ProgressBar():
        # Fixed seed so that the simulated dataset is reproducible
        rs = da.random.RandomState(0)
        # Raw genotype proxy: integers in [0, 128) laid out as (variants, samples)
        XL = rs.randint(0, 128, size=(n, m), chunks=(5216, 5792))
        # True per-variant effects: only the first variant influences the outcome.
        # The vector must have one entry per *variant* (length n); a length-m vector
        # would broadcast along the sample axis and wipe out the genetic signal for
        # every sample except the first.
        BL = da.array([1] + [0] * (n - 1))
        # Covariates (samples, covariates) and their true effects (covariates,)
        XC, BC = rs.normal(size=(m, c)), rs.normal(size=(c,))
        # Outcome per sample = genetic effect + covariate effect + small noise
        Y = (XL * BL[:, np.newaxis]).sum(axis=0) + XC @ BC + rs.normal(scale=.001, size=m)
        ds = xr.Dataset(dict(
            # This is a proxy for discretized allele dosages (between 0 and 2)
            XL=(('variants', 'samples'), (2 * XL / 127).astype('f2')),
            # This value represents covariates for samples, e.g. age, sex, ancestry, etc.
            XC=(('samples', 'covariates'), XC.astype('f4')),
            # This is the outcome on which all variant data will be regressed separately
            Y=(('samples', 'outcomes'), Y[:, np.newaxis].astype('f4')),
        ))
        print(f'Saving simulated data to {path}')
        ds.to_zarr(fsspec.get_mapper(path), mode='w', consolidated=True)

In [20]:
# Open the simulated dataset from cloud storage using its consolidated zarr metadata;
# the variables come back as chunked dask arrays (see the repr below).
ds = xr.open_zarr(fsspec.get_mapper(path), consolidated=True)
ds

Unnamed: 0,Array,Chunk
Bytes,36.59 MB,36.59 MB
Shape,"(365941, 25)","(365941, 25)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 36.59 MB 36.59 MB Shape (365941, 25) (365941, 25) Count 2 Tasks 1 Chunks Type float32 numpy.ndarray",25  365941,

Unnamed: 0,Array,Chunk
Bytes,36.59 MB,36.59 MB
Shape,"(365941, 25)","(365941, 25)"
Count,2 Tasks,1 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.18 GB,60.42 MB
Shape,"(8444, 365941)","(5216, 5792)"
Count,129 Tasks,128 Chunks
Type,float16,numpy.ndarray
"Array Chunk Bytes 6.18 GB 60.42 MB Shape (8444, 365941) (5216, 5792) Count 129 Tasks 128 Chunks Type float16 numpy.ndarray",365941  8444,

Unnamed: 0,Array,Chunk
Bytes,6.18 GB,60.42 MB
Shape,"(8444, 365941)","(5216, 5792)"
Count,129 Tasks,128 Chunks
Type,float16,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.46 MB,23.17 kB
Shape,"(365941, 1)","(5792, 1)"
Count,65 Tasks,64 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.46 MB 23.17 kB Shape (365941, 1) (5792, 1) Count 65 Tasks 64 Chunks Type float32 numpy.ndarray",1  365941,

Unnamed: 0,Array,Chunk
Bytes,1.46 MB,23.17 kB
Shape,"(365941, 1)","(5792, 1)"
Count,65 Tasks,64 Chunks
Type,float32,numpy.ndarray


In [21]:
def gwas(XL, XC, Y):
    """Regress every outcome on each variant separately, adjusting for covariates.

    Parameters
    ----------
    XL : dask array, shape (samples, variants)
        Variant ("loop covariate") values; each column is tested on its own.
    XC : dask array, shape (samples, covariates)
        Core covariates shared by every regression. An intercept column of
        ones is prepended inside this function.
    Y : dask array, shape (samples, outcomes)
        Outcomes to regress on each variant.

    Returns
    -------
    xarray.Dataset
        Variables ``beta`` (effect estimate) and ``pval`` (two-sided t-test
        p-value), both with dims ``('variants', 'outcomes')``.
    """
    # Prepend a column of ones so the covariate model includes an intercept
    intercept = da.ones((XC.shape[0], 1), dtype=XC.dtype)
    XC = da.concatenate([intercept, XC], axis=1)

    # Keep the row chunking but put the short (column) axis into a single chunk
    XC, Y = XC.rechunk((None, -1)), Y.rechunk((None, -1))
    # Residual degrees of freedom: samples, minus covariate columns (intercept
    # included), minus one for the variant under test
    dof = Y.shape[0] - XC.shape[1] - 1

    def residualize(A):
        # Subtract from A its least-squares fit on the core covariates
        return A - XC @ da.linalg.lstsq(XC, A)[0]

    # Project the core covariates out of both the variants and the outcomes
    XL_res = residualize(XL)
    Y_res = residualize(Y)

    # Per-variant sum of squares as a (variants, 1) column, followed by the
    # single-regressor least-squares slope for every variant/outcome pair
    squared = XL_res ** 2
    sum_sq = squared.sum(axis=0, keepdims=True).T
    beta = (XL_res.T @ Y_res) / sum_sq

    # Residuals for each variant and outcome, shape (samples, variants, outcomes)
    fitted = XL_res[..., None] * beta[None, ...]
    resid = Y_res[:, None, :] - fitted
    rss = (resid ** 2).sum(axis=0)

    # t-statistic = estimate / standard error
    std_err = np.sqrt(rss / dof / sum_sq)
    t_stat = beta / std_err

    def two_sided_pvalue(t):
        # Survival function of Student's t, doubled for a two-sided test
        return 2 * stats.distributions.t.sf(np.abs(t), dof)

    pval = da.map_blocks(two_sided_pvalue, t_stat, dtype="float64")

    dims = ('variants', 'outcomes')
    return xr.Dataset({'beta': (dims, beta), 'pval': (dims, pval)})

In [22]:
# Define the GWAS regressions (this only builds the lazy result dataset;
# the work is triggered when the results are written out)
dsr = gwas(
    # Note: This (the largest) array needs to be rechunked due to scalability 
    # issues with da.matmul, specifically https://github.com/dask/dask/pull/6924.
    # See here for more details:
    # https://github.com/pystatgen/sgkit/issues/390#issuecomment-730660134
    # After rechunking, the stored (variants, samples) float16 array is transposed
    # to (samples, variants) and upcast to float32 for the regression.
    ds.XL.data.rechunk((652, 5792)).T.astype('f4'), 
    ds.XC.data, 
    ds.Y.data
)
dsr

Unnamed: 0,Array,Chunk
Bytes,33.78 kB,2.61 kB
Shape,"(8444, 1)","(652, 1)"
Count,11271 Tasks,13 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 33.78 kB 2.61 kB Shape (8444, 1) (652, 1) Count 11271 Tasks 13 Chunks Type float32 numpy.ndarray",1  8444,

Unnamed: 0,Array,Chunk
Bytes,33.78 kB,2.61 kB
Shape,"(8444, 1)","(652, 1)"
Count,11271 Tasks,13 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,67.55 kB,5.22 kB
Shape,"(8444, 1)","(652, 1)"
Count,15846 Tasks,13 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 67.55 kB 5.22 kB Shape (8444, 1) (652, 1) Count 15846 Tasks 13 Chunks Type float64 numpy.ndarray",1  8444,

Unnamed: 0,Array,Chunk
Bytes,67.55 kB,5.22 kB
Shape,"(8444, 1)","(652, 1)"
Count,15846 Tasks,13 Chunks
Type,float64,numpy.ndarray


In [23]:
# Two alternative context managers; exactly one should be active.
# Option 1: dask in-memory (swap in the commented line below)
#with ProgressBar():
# Option 2: dask distributed (active); writes an HTML performance report under reports/
with performance_report(f"reports/pr_{n}_{m}_{c}.html"):
    # Compute and save betas/p-values
    output_path = f"gs://rs-gwas-benchmark/sim_res_{n}_{m}_{c}.zarr"
    # Writing to zarr is what triggers evaluation of the lazy regression results
    dsr.to_zarr(fsspec.get_mapper(output_path), mode='w', consolidated=True)
    print(f'Results saved to {output_path}')

Results saved to gs://rs-gwas-benchmark/sim_res_8444_365941_25.zarr
