## Sgkit GWAS Workflow Example - Chunk Report

Chunk report for the GWAS workflow that shows the shape and chunk size for each intermediate variable in the calculation.

In [266]:
%run setup.ipynb

In [267]:
%run matmul.ipynb

In [268]:
def create_dataset(n, m, c):
    rs = da.random.RandomState(0)
    XL, BL = rs.randint(0, 128, size=(n, m), chunks=(163, 11584)), da.array([1] + [0] * (m - 1))
    XC, BC = rs.normal(size=(m, c)), rs.normal(size=(c,))
    Y = (XL * BL).sum(axis=0) + XC @ BC + rs.normal(scale=.001, size=m)
    ds = xr.Dataset(dict(
        # This is a proxy for discretized allele dosages (between 0 and 2)
        XL=(('variants', 'samples'), (2 * XL / 127).astype('f2')),
        # This value represents covariates for samples, e.g. age, sex, ancestry, etc.
        XC=(('samples', 'covariates'), XC.astype('f4')),
        # This is the outcome on which all variant data will be regressed separately
        Y=(('samples', 'outcomes'), Y[:, np.newaxis].astype('f4')),
    ))
    return ds

In [269]:
# Choose dataset size
# Debug settings
n = 500 # Number of variants (i.e. genomic locations)
m = 1000 # Number of individuals (i.e. people)
c = 3    # Number of covariates (i.e. confounders)

# Representative settings for single (small) UK Biobank chromosome:
# n, m, c = 141910, 365941, 25

# XY chromosome
n, m, c = 8444, 365941, 25
# factor to multiply XY chromosome variants by (to measure scaling)
# note that chr11 is approx factor=16
factor = 8
n = n * factor

In [270]:
ds = create_dataset(n, m, c)

For comparison, here is the `gwas` function that is broken down below to show shapes and chunk sizes for each intermediate variable.

In [271]:
def gwas(XL, XC, Y):
    # Add intercept
    XC = da.concatenate([da.ones((XC.shape[0], 1), dtype=XC.dtype), XC], axis=1)
    
    # Rechunk along short axes
    XC = XC.rechunk((None, -1))
    Y = Y.rechunk((None, -1))
    dof = Y.shape[0] - XC.shape[1] - 1
    
    # Apply orthogonal projection to eliminate core covariates
    #XLP = XL - XC @ da.linalg.lstsq(XC, XL)[0]
    #YP = Y - XC @ da.linalg.lstsq(XC, Y)[0]
    XLP = XL - matmul(XC, da.linalg.lstsq(XC, XL)[0])
    YP = Y - matmul(XC, da.linalg.lstsq(XC, Y)[0])

    # Estimate coefficients for each loop covariate
    XLPS = (XLP ** 2).sum(axis=0, keepdims=True).T
    #B = (XLP.T @ YP) / XLPS
    B = matmul(XLP.T, YP) / XLPS

    # Compute residuals for each loop covariate and outcome separately
    YR = YP[:, np.newaxis, :] - XLP[..., np.newaxis] * B[np.newaxis, ...]
    RSS = (YR ** 2).sum(axis=0)
    
    # Get t-statistics for coefficient estimates and match to p-values
    T = B / np.sqrt(RSS / dof / XLPS)
    P = da.map_blocks(
        lambda t: 2 * stats.distributions.t.sf(np.abs(t), dof), T, dtype="float64"
    )
    return xr.Dataset(dict(
        beta=(('variants','outcomes'), B), 
        pval=(('variants','outcomes'), P)
    ))
# Define the GWAS regressions
dsr = gwas(
    # Note: This (the largest) array needs to be rechunked due to scalability 
    # issues with da.matmul, specifically https://github.com/dask/dask/pull/6924.
    # See here for more details:
    # https://github.com/pystatgen/sgkit/issues/390#issuecomment-730660134
    ds.XL.data.rechunk((652, 5792)).T.astype('f4'), 
    ds.XC.data, 
    ds.Y.data
)

In [272]:
XL = ds.XL.data
XL = XL.rechunk((652, 5792))
XL

Unnamed: 0,Array,Chunk
Bytes,49.44 GB,7.55 MB
Shape,"(67552, 365941)","(652, 5792)"
Count,86336 Tasks,6656 Chunks
Type,float16,numpy.ndarray
"Array Chunk Bytes 49.44 GB 7.55 MB Shape (67552, 365941) (652, 5792) Count 86336 Tasks 6656 Chunks Type float16 numpy.ndarray",365941  67552,

Unnamed: 0,Array,Chunk
Bytes,49.44 GB,7.55 MB
Shape,"(67552, 365941)","(652, 5792)"
Count,86336 Tasks,6656 Chunks
Type,float16,numpy.ndarray


In [273]:
XL = XL.T.astype('f4')
XL

Unnamed: 0,Array,Chunk
Bytes,98.88 GB,15.11 MB
Shape,"(365941, 67552)","(5792, 652)"
Count,99648 Tasks,6656 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 98.88 GB 15.11 MB Shape (365941, 67552) (5792, 652) Count 99648 Tasks 6656 Chunks Type float32 numpy.ndarray",67552  365941,

Unnamed: 0,Array,Chunk
Bytes,98.88 GB,15.11 MB
Shape,"(365941, 67552)","(5792, 652)"
Count,99648 Tasks,6656 Chunks
Type,float32,numpy.ndarray


In [274]:
XC = ds.XC.data
XC = da.concatenate([da.ones((XC.shape[0], 1), dtype=XC.dtype), XC], axis=1) # Add intercept
XC = XC.rechunk((None, -1))
XC

Unnamed: 0,Array,Chunk
Bytes,38.06 MB,38.06 MB
Shape,"(365941, 26)","(365941, 26)"
Count,6 Tasks,1 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 38.06 MB 38.06 MB Shape (365941, 26) (365941, 26) Count 6 Tasks 1 Chunks Type float32 numpy.ndarray",26  365941,

Unnamed: 0,Array,Chunk
Bytes,38.06 MB,38.06 MB
Shape,"(365941, 26)","(365941, 26)"
Count,6 Tasks,1 Chunks
Type,float32,numpy.ndarray


In [275]:
# Apply orthogonal projection to eliminate core covariates
LS = da.linalg.lstsq(XC, XL)[0]
#LS = LS.rechunk((None, 326))
LS

Unnamed: 0,Array,Chunk
Bytes,7.03 MB,67.81 kB
Shape,"(26, 67552)","(26, 652)"
Count,115387 Tasks,104 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 7.03 MB 67.81 kB Shape (26, 67552) (26, 652) Count 115387 Tasks 104 Chunks Type float32 numpy.ndarray",67552  26,

Unnamed: 0,Array,Chunk
Bytes,7.03 MB,67.81 kB
Shape,"(26, 67552)","(26, 652)"
Count,115387 Tasks,104 Chunks
Type,float32,numpy.ndarray


In [276]:
XC @ LS

Unnamed: 0,Array,Chunk
Bytes,98.88 GB,954.37 MB
Shape,"(365941, 67552)","(365941, 652)"
Count,115491 Tasks,104 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 98.88 GB 954.37 MB Shape (365941, 67552) (365941, 652) Count 115491 Tasks 104 Chunks Type float32 numpy.ndarray",67552  365941,

Unnamed: 0,Array,Chunk
Bytes,98.88 GB,954.37 MB
Shape,"(365941, 67552)","(365941, 652)"
Count,115491 Tasks,104 Chunks
Type,float32,numpy.ndarray


In [277]:
XLP = XL - XC @ LS
XLP

Unnamed: 0,Array,Chunk
Bytes,98.88 GB,15.11 MB
Shape,"(365941, 67552)","(5792, 652)"
Count,135459 Tasks,6656 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 98.88 GB 15.11 MB Shape (365941, 67552) (5792, 652) Count 135459 Tasks 6656 Chunks Type float32 numpy.ndarray",67552  365941,

Unnamed: 0,Array,Chunk
Bytes,98.88 GB,15.11 MB
Shape,"(365941, 67552)","(5792, 652)"
Count,135459 Tasks,6656 Chunks
Type,float32,numpy.ndarray


In [278]:
# Estimate coefficients for each loop covariate
XLPS = (XLP ** 2).sum(axis=0, keepdims=True).T
XLPS

Unnamed: 0,Array,Chunk
Bytes,270.21 kB,2.61 kB
Shape,"(67552, 1)","(652, 1)"
Count,151059 Tasks,104 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 270.21 kB 2.61 kB Shape (67552, 1) (652, 1) Count 151059 Tasks 104 Chunks Type float32 numpy.ndarray",1  67552,

Unnamed: 0,Array,Chunk
Bytes,270.21 kB,2.61 kB
Shape,"(67552, 1)","(652, 1)"
Count,151059 Tasks,104 Chunks
Type,float32,numpy.ndarray


In [279]:
Y = ds.Y.data
Y = Y.rechunk((None, -1))
Y

Unnamed: 0,Array,Chunk
Bytes,1.46 MB,46.34 kB
Shape,"(365941, 1)","(11584, 1)"
Count,44647 Tasks,32 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.46 MB 46.34 kB Shape (365941, 1) (11584, 1) Count 44647 Tasks 32 Chunks Type float32 numpy.ndarray",1  365941,

Unnamed: 0,Array,Chunk
Bytes,1.46 MB,46.34 kB
Shape,"(365941, 1)","(11584, 1)"
Count,44647 Tasks,32 Chunks
Type,float32,numpy.ndarray


In [280]:
# Apply orthogonal projection to eliminate core covariates
YP = Y - matmul(XC, da.linalg.lstsq(XC, Y)[0])
YP

Unnamed: 0,Array,Chunk
Bytes,1.46 MB,46.34 kB
Shape,"(365941, 1)","(11584, 1)"
Count,44895 Tasks,32 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.46 MB 46.34 kB Shape (365941, 1) (11584, 1) Count 44895 Tasks 32 Chunks Type float32 numpy.ndarray",1  365941,

Unnamed: 0,Array,Chunk
Bytes,1.46 MB,46.34 kB
Shape,"(365941, 1)","(11584, 1)"
Count,44895 Tasks,32 Chunks
Type,float32,numpy.ndarray


In [281]:
XLP.T @ YP

Unnamed: 0,Array,Chunk
Bytes,270.21 kB,2.61 kB
Shape,"(67552, 1)","(652, 1)"
Count,173951 Tasks,104 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 270.21 kB 2.61 kB Shape (67552, 1) (652, 1) Count 173951 Tasks 104 Chunks Type float32 numpy.ndarray",1  67552,

Unnamed: 0,Array,Chunk
Bytes,270.21 kB,2.61 kB
Shape,"(67552, 1)","(652, 1)"
Count,173951 Tasks,104 Chunks
Type,float32,numpy.ndarray


In [282]:
B = (XLP.T @ YP) / XLPS
B

Unnamed: 0,Array,Chunk
Bytes,270.21 kB,2.61 kB
Shape,"(67552, 1)","(652, 1)"
Count,189655 Tasks,104 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 270.21 kB 2.61 kB Shape (67552, 1) (652, 1) Count 189655 Tasks 104 Chunks Type float32 numpy.ndarray",1  67552,

Unnamed: 0,Array,Chunk
Bytes,270.21 kB,2.61 kB
Shape,"(67552, 1)","(652, 1)"
Count,189655 Tasks,104 Chunks
Type,float32,numpy.ndarray


In [283]:
YR = YP[:, np.newaxis, :] - XLP[..., np.newaxis] * B[np.newaxis, ...]
YR

Unnamed: 0,Array,Chunk
Bytes,98.88 GB,15.11 MB
Shape,"(365941, 67552, 1)","(5792, 652, 1)"
Count,209887 Tasks,6656 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 98.88 GB 15.11 MB Shape (365941, 67552, 1) (5792, 652, 1) Count 209887 Tasks 6656 Chunks Type float32 numpy.ndarray",1  67552  365941,

Unnamed: 0,Array,Chunk
Bytes,98.88 GB,15.11 MB
Shape,"(365941, 67552, 1)","(5792, 652, 1)"
Count,209887 Tasks,6656 Chunks
Type,float32,numpy.ndarray


In [284]:
RSS = (YR ** 2).sum(axis=0)
RSS

Unnamed: 0,Array,Chunk
Bytes,270.21 kB,2.61 kB
Shape,"(67552, 1)","(652, 1)"
Count,225383 Tasks,104 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 270.21 kB 2.61 kB Shape (67552, 1) (652, 1) Count 225383 Tasks 104 Chunks Type float32 numpy.ndarray",1  67552,

Unnamed: 0,Array,Chunk
Bytes,270.21 kB,2.61 kB
Shape,"(67552, 1)","(652, 1)"
Count,225383 Tasks,104 Chunks
Type,float32,numpy.ndarray


In [285]:
dof = Y.shape[0] - XC.shape[1] - 1
dof

365914

In [286]:
T = B / np.sqrt(RSS / dof / XLPS)
T

Unnamed: 0,Array,Chunk
Bytes,540.42 kB,5.22 kB
Shape,"(67552, 1)","(652, 1)"
Count,225799 Tasks,104 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 540.42 kB 5.22 kB Shape (67552, 1) (652, 1) Count 225799 Tasks 104 Chunks Type float64 numpy.ndarray",1  67552,

Unnamed: 0,Array,Chunk
Bytes,540.42 kB,5.22 kB
Shape,"(67552, 1)","(652, 1)"
Count,225799 Tasks,104 Chunks
Type,float64,numpy.ndarray


In [287]:
P = da.map_blocks(
        lambda t: 2 * stats.distributions.t.sf(np.abs(t), dof), T, dtype="float64"
    )
P

Unnamed: 0,Array,Chunk
Bytes,540.42 kB,5.22 kB
Shape,"(67552, 1)","(652, 1)"
Count,225903 Tasks,104 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 540.42 kB 5.22 kB Shape (67552, 1) (652, 1) Count 225903 Tasks 104 Chunks Type float64 numpy.ndarray",1  67552,

Unnamed: 0,Array,Chunk
Bytes,540.42 kB,5.22 kB
Shape,"(67552, 1)","(652, 1)"
Count,225903 Tasks,104 Chunks
Type,float64,numpy.ndarray


In [292]:
# approx guess of which arrays need to be kept in memory at once
arrs = [XL, XLP, YR]
total_mem = sum([arr.nbytes for arr in arrs])
dask.utils.format_bytes(total_mem)

'296.64 GB'

In [293]:
# approx how many workers do we need for this amount of memory in the cluster?
n1_standard_8_mem = 30 * 1_000_000_000
total_mem // n1_standard_8_mem

9