# H12 with sgkit

This notebook is for running a H12 scan using sgkit, to reproduce the scikit-allel one (`h12_scans.ipynb`).

You need to have run `sgkit_import_haplotypes.ipynb` first to convert the data into sgkit format.

In [1]:
%run setup.ipynb

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from dask.diagnostics import ProgressBar
import sgkit as sg
import xarray as xr

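First, let's inspect the input data. Note that the popgen analyses require a single chunk in the `samples` dimension. The call arrays shown below are actually split into two chunks along `samples` (chunk shape `(524288, 1142, 2)` for 1164 samples), which is why the haplotypes are rechunked on disk later in this notebook.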

In [4]:
# Lazily open the sgkit-format haplotype dataset (see sgkit_import_haplotypes.ipynb).
# `here()` is not defined in this notebook; presumably it is provided by setup.ipynb
# and resolves the project root — TODO confirm.
ds = xr.open_zarr(str(here() / 'data/sgkit/ag1000g_haplotypes.zarr'), concat_characters=False)
# Bare last expression so the notebook renders the dataset summary.
ds

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.20 GB
Shape,"(39604636, 1164, 2)","(524288, 1142, 2)"
Count,153 Tasks,152 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 92.20 GB 1.20 GB Shape (39604636, 1164, 2) (524288, 1142, 2) Count 153 Tasks 152 Chunks Type int8 numpy.ndarray",2  1164  39604636,

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.20 GB
Shape,"(39604636, 1164, 2)","(524288, 1142, 2)"
Count,153 Tasks,152 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.20 GB
Shape,"(39604636, 1164, 2)","(524288, 1142, 2)"
Count,153 Tasks,152 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 92.20 GB 1.20 GB Shape (39604636, 1164, 2) (524288, 1142, 2) Count 153 Tasks 152 Chunks Type bool numpy.ndarray",2  1164  39604636,

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.20 GB
Shape,"(39604636, 1164, 2)","(524288, 1142, 2)"
Count,153 Tasks,152 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,37.25 kB,37.25 kB
Shape,"(1164,)","(1164,)"
Count,2 Tasks,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 37.25 kB 37.25 kB Shape (1164,) (1164,) Count 2 Tasks 1 Chunks Type numpy.ndarray",1164  1,

Unnamed: 0,Array,Chunk
Bytes,37.25 kB,37.25 kB
Shape,"(1164,)","(1164,)"
Count,2 Tasks,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,79.21 MB,8.39 MB
Shape,"(39604636, 2)","(4194304, 2)"
Count,11 Tasks,10 Chunks
Type,|S1,numpy.ndarray
"Array Chunk Bytes 79.21 MB 8.39 MB Shape (39604636, 2) (4194304, 2) Count 11 Tasks 10 Chunks Type |S1 numpy.ndarray",2  39604636,

Unnamed: 0,Array,Chunk
Bytes,79.21 MB,8.39 MB
Shape,"(39604636, 2)","(4194304, 2)"
Count,11 Tasks,10 Chunks
Type,|S1,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,158.42 MB,16.78 MB
Shape,"(39604636,)","(4194304,)"
Count,11 Tasks,10 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 158.42 MB 16.78 MB Shape (39604636,) (4194304,) Count 11 Tasks 10 Chunks Type int32 numpy.ndarray",39604636  1,

Unnamed: 0,Array,Chunk
Bytes,158.42 MB,16.78 MB
Shape,"(39604636,)","(4194304,)"
Count,11 Tasks,10 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,158.42 MB,16.78 MB
Shape,"(39604636,)","(4194304,)"
Count,11 Tasks,10 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 158.42 MB 16.78 MB Shape (39604636,) (4194304,) Count 11 Tasks 10 Chunks Type int32 numpy.ndarray",39604636  1,

Unnamed: 0,Array,Chunk
Bytes,158.42 MB,16.78 MB
Shape,"(39604636,)","(4194304,)"
Count,11 Tasks,10 Chunks
Type,int32,numpy.ndarray


## Cohorts

We need to divide the samples into separate cohorts, which we get from the `pop_defs` YAML:

In [5]:
# Cohort identifiers are the keys of `pop_defs`; their order defines the integer
# cohort index used for `sample_cohort` further down.
cohort_ids = list(pop_defs)
cohort_ids

['ao_col',
 'bf_col',
 'bf_gam',
 'ci_col',
 'cm_sav_gam',
 'fr_gam',
 'ga_gam',
 'gh_col',
 'gh_gam',
 'gm',
 'gn_gam',
 'gq_gam',
 'gw',
 'ke',
 'ug_gam']

In [6]:
# Store the cohort names on the dataset along a new "cohorts" dimension.
ds["cohort_id"] = xr.DataArray(data=cohort_ids, dims=["cohorts"])
ds

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.20 GB
Shape,"(39604636, 1164, 2)","(524288, 1142, 2)"
Count,153 Tasks,152 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 92.20 GB 1.20 GB Shape (39604636, 1164, 2) (524288, 1142, 2) Count 153 Tasks 152 Chunks Type int8 numpy.ndarray",2  1164  39604636,

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.20 GB
Shape,"(39604636, 1164, 2)","(524288, 1142, 2)"
Count,153 Tasks,152 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.20 GB
Shape,"(39604636, 1164, 2)","(524288, 1142, 2)"
Count,153 Tasks,152 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 92.20 GB 1.20 GB Shape (39604636, 1164, 2) (524288, 1142, 2) Count 153 Tasks 152 Chunks Type bool numpy.ndarray",2  1164  39604636,

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.20 GB
Shape,"(39604636, 1164, 2)","(524288, 1142, 2)"
Count,153 Tasks,152 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,37.25 kB,37.25 kB
Shape,"(1164,)","(1164,)"
Count,2 Tasks,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 37.25 kB 37.25 kB Shape (1164,) (1164,) Count 2 Tasks 1 Chunks Type numpy.ndarray",1164  1,

Unnamed: 0,Array,Chunk
Bytes,37.25 kB,37.25 kB
Shape,"(1164,)","(1164,)"
Count,2 Tasks,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,79.21 MB,8.39 MB
Shape,"(39604636, 2)","(4194304, 2)"
Count,11 Tasks,10 Chunks
Type,|S1,numpy.ndarray
"Array Chunk Bytes 79.21 MB 8.39 MB Shape (39604636, 2) (4194304, 2) Count 11 Tasks 10 Chunks Type |S1 numpy.ndarray",2  39604636,

Unnamed: 0,Array,Chunk
Bytes,79.21 MB,8.39 MB
Shape,"(39604636, 2)","(4194304, 2)"
Count,11 Tasks,10 Chunks
Type,|S1,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,158.42 MB,16.78 MB
Shape,"(39604636,)","(4194304,)"
Count,11 Tasks,10 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 158.42 MB 16.78 MB Shape (39604636,) (4194304,) Count 11 Tasks 10 Chunks Type int32 numpy.ndarray",39604636  1,

Unnamed: 0,Array,Chunk
Bytes,158.42 MB,16.78 MB
Shape,"(39604636,)","(4194304,)"
Count,11 Tasks,10 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,158.42 MB,16.78 MB
Shape,"(39604636,)","(4194304,)"
Count,11 Tasks,10 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 158.42 MB 16.78 MB Shape (39604636,) (4194304,) Count 11 Tasks 10 Chunks Type int32 numpy.ndarray",39604636  1,

Unnamed: 0,Array,Chunk
Bytes,158.42 MB,16.78 MB
Shape,"(39604636,)","(4194304,)"
Count,11 Tasks,10 Chunks
Type,int32,numpy.ndarray


Sample metadata is in the `df_samples` dataframe, so we can use that to produce a mapping from each sample to its cohort index:

In [7]:
# Build a per-sample array of cohort indices; -1 marks samples in no cohort.
sample_cohorts = np.full(ds.sample_id.shape, -1, dtype=np.int8)

# Textual rewrites applied, in this order, to each cohort's query string before
# it is run against `df_samples`.
query_rewrites = (
    ('region', 'location'),
    ('Gado-Badzere', 'Gado Badzere'),
    ('Zembe-Borongo', 'Zembe Borongo'),
)

for cohort_index, cohort_name in enumerate(cohort_ids):
    cohort_query = pop_defs[cohort_name]['query']
    for old_text, new_text in query_rewrites:
        cohort_query = cohort_query.replace(old_text, new_text)
    # NOTE(review): the dataframe index labels are used as positions into
    # `sample_cohorts`; this assumes `df_samples` has a 0..n-1 index in the same
    # order as the dataset's samples — confirm.
    matching_samples = df_samples.query(cohort_query).index.values
    sample_cohorts[matching_samples] = cohort_index

sample_cohorts

array([ 7,  7,  7, ..., -1, -1, -1], dtype=int8)

Add `sample_cohort` to the dataset:

In [8]:
# Attach the per-sample cohort index (-1 = not in any cohort) along "samples".
ds["sample_cohort"] = xr.DataArray(data=sample_cohorts, dims=["samples"])
ds

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.20 GB
Shape,"(39604636, 1164, 2)","(524288, 1142, 2)"
Count,153 Tasks,152 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 92.20 GB 1.20 GB Shape (39604636, 1164, 2) (524288, 1142, 2) Count 153 Tasks 152 Chunks Type int8 numpy.ndarray",2  1164  39604636,

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.20 GB
Shape,"(39604636, 1164, 2)","(524288, 1142, 2)"
Count,153 Tasks,152 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.20 GB
Shape,"(39604636, 1164, 2)","(524288, 1142, 2)"
Count,153 Tasks,152 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 92.20 GB 1.20 GB Shape (39604636, 1164, 2) (524288, 1142, 2) Count 153 Tasks 152 Chunks Type bool numpy.ndarray",2  1164  39604636,

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.20 GB
Shape,"(39604636, 1164, 2)","(524288, 1142, 2)"
Count,153 Tasks,152 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,37.25 kB,37.25 kB
Shape,"(1164,)","(1164,)"
Count,2 Tasks,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 37.25 kB 37.25 kB Shape (1164,) (1164,) Count 2 Tasks 1 Chunks Type numpy.ndarray",1164  1,

Unnamed: 0,Array,Chunk
Bytes,37.25 kB,37.25 kB
Shape,"(1164,)","(1164,)"
Count,2 Tasks,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,79.21 MB,8.39 MB
Shape,"(39604636, 2)","(4194304, 2)"
Count,11 Tasks,10 Chunks
Type,|S1,numpy.ndarray
"Array Chunk Bytes 79.21 MB 8.39 MB Shape (39604636, 2) (4194304, 2) Count 11 Tasks 10 Chunks Type |S1 numpy.ndarray",2  39604636,

Unnamed: 0,Array,Chunk
Bytes,79.21 MB,8.39 MB
Shape,"(39604636, 2)","(4194304, 2)"
Count,11 Tasks,10 Chunks
Type,|S1,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,158.42 MB,16.78 MB
Shape,"(39604636,)","(4194304,)"
Count,11 Tasks,10 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 158.42 MB 16.78 MB Shape (39604636,) (4194304,) Count 11 Tasks 10 Chunks Type int32 numpy.ndarray",39604636  1,

Unnamed: 0,Array,Chunk
Bytes,158.42 MB,16.78 MB
Shape,"(39604636,)","(4194304,)"
Count,11 Tasks,10 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,158.42 MB,16.78 MB
Shape,"(39604636,)","(4194304,)"
Count,11 Tasks,10 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 158.42 MB 16.78 MB Shape (39604636,) (4194304,) Count 11 Tasks 10 Chunks Type int32 numpy.ndarray",39604636  1,

Unnamed: 0,Array,Chunk
Bytes,158.42 MB,16.78 MB
Shape,"(39604636,)","(4194304,)"
Count,11 Tasks,10 Chunks
Type,int32,numpy.ndarray


Some samples are not in any of the named cohorts, and have -1 in the `sample_cohort` variable. These are ignored in cohort allele counts.

## Windowing

To compute popgen stats we need to set up windows along the genome. The scikit-allel H12 scan actually uses a window size that varies by population cohort, but for now we'll use a fixed window size of 6000 variants, with a step of half that (3000 variants).

In [9]:
# Add windowing information to the dataset: windows of 6000 with a step of 3000,
# so consecutive windows overlap by half.
# NOTE(review): this rebinds `ds`, so re-running the cell passes an
# already-windowed dataset back into sg.window — confirm that is acceptable.
ds = sg.window(ds, size=6000, step=3000)
ds

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.20 GB
Shape,"(39604636, 1164, 2)","(524288, 1142, 2)"
Count,153 Tasks,152 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 92.20 GB 1.20 GB Shape (39604636, 1164, 2) (524288, 1142, 2) Count 153 Tasks 152 Chunks Type int8 numpy.ndarray",2  1164  39604636,

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.20 GB
Shape,"(39604636, 1164, 2)","(524288, 1142, 2)"
Count,153 Tasks,152 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.20 GB
Shape,"(39604636, 1164, 2)","(524288, 1142, 2)"
Count,153 Tasks,152 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 92.20 GB 1.20 GB Shape (39604636, 1164, 2) (524288, 1142, 2) Count 153 Tasks 152 Chunks Type bool numpy.ndarray",2  1164  39604636,

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.20 GB
Shape,"(39604636, 1164, 2)","(524288, 1142, 2)"
Count,153 Tasks,152 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,37.25 kB,37.25 kB
Shape,"(1164,)","(1164,)"
Count,2 Tasks,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 37.25 kB 37.25 kB Shape (1164,) (1164,) Count 2 Tasks 1 Chunks Type numpy.ndarray",1164  1,

Unnamed: 0,Array,Chunk
Bytes,37.25 kB,37.25 kB
Shape,"(1164,)","(1164,)"
Count,2 Tasks,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,79.21 MB,8.39 MB
Shape,"(39604636, 2)","(4194304, 2)"
Count,11 Tasks,10 Chunks
Type,|S1,numpy.ndarray
"Array Chunk Bytes 79.21 MB 8.39 MB Shape (39604636, 2) (4194304, 2) Count 11 Tasks 10 Chunks Type |S1 numpy.ndarray",2  39604636,

Unnamed: 0,Array,Chunk
Bytes,79.21 MB,8.39 MB
Shape,"(39604636, 2)","(4194304, 2)"
Count,11 Tasks,10 Chunks
Type,|S1,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,158.42 MB,16.78 MB
Shape,"(39604636,)","(4194304,)"
Count,11 Tasks,10 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 158.42 MB 16.78 MB Shape (39604636,) (4194304,) Count 11 Tasks 10 Chunks Type int32 numpy.ndarray",39604636  1,

Unnamed: 0,Array,Chunk
Bytes,158.42 MB,16.78 MB
Shape,"(39604636,)","(4194304,)"
Count,11 Tasks,10 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,158.42 MB,16.78 MB
Shape,"(39604636,)","(4194304,)"
Count,11 Tasks,10 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 158.42 MB 16.78 MB Shape (39604636,) (4194304,) Count 11 Tasks 10 Chunks Type int32 numpy.ndarray",39604636  1,

Unnamed: 0,Array,Chunk
Bytes,158.42 MB,16.78 MB
Shape,"(39604636,)","(4194304,)"
Count,11 Tasks,10 Chunks
Type,int32,numpy.ndarray


## H12

We are now in a position to calculate the H statistics.

In [13]:
# Derive the call haplotypes from the dataset. Per the output below, the result is a
# 2-D (variants, haplotypes) array with 2328 = 1164 * 2 columns.
# merge=False presumably returns only the newly computed variable rather than the
# full dataset — TODO confirm against the sgkit version in use.
h = sg.to_call_haplotypes(ds, merge=False)
h

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.20 GB
Shape,"(39604636, 2328)","(524288, 2284)"
Count,305 Tasks,152 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 92.20 GB 1.20 GB Shape (39604636, 2328) (524288, 2284) Count 305 Tasks 152 Chunks Type int8 numpy.ndarray",2328  39604636,

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.20 GB
Shape,"(39604636, 2328)","(524288, 2284)"
Count,305 Tasks,152 Chunks
Type,int8,numpy.ndarray


In [14]:
# Write the haplotype array to zarr so that it can be rechunked on disk in the
# following cells. mode="w" replaces any existing store at this path; the
# ProgressBar context reports progress of the dask computation.
with ProgressBar():
    h.to_zarr(str(here() / 'data/sgkit/ag1000g_ht.zarr'), mode="w")

[########################################] | 100% Completed |  1min 53.0s


Now rechunk on disk: the haplotypes dimension currently has two chunks, and we want a single one.

In [17]:
# Inputs for the on-disk rechunk of the haplotype store written above.
# `zarr` is not imported in this notebook; presumably it comes from setup.ipynb.
source_group = zarr.open(str(here() / 'data/sgkit/ag1000g_ht.zarr'))
# Second chunk dimension is 2328, the full haplotype axis length shown above, so each
# chunk spans every haplotype.
target_chunks = {"call_haplotype": (524288, 2328)}
# Memory budget handed to rechunker.
max_mem = '2GB'

# Final rechunked store, and the intermediate store passed to rechunker as temp_store.
target_store = str(here() / 'data/sgkit/ag1000g_ht_rechunked.zarr')
temp_store = str(here() / 'data/sgkit/ag1000g_ht_rechunked_tmp.zarr')

In [18]:
# Build the rechunking plan from the settings defined in the previous cell.
# Nothing is executed here; the plan is run in the next cell via plan.execute().
from rechunker import api as rechunker_api
plan = rechunker_api.rechunk(source_group, target_chunks, max_mem, target_store, temp_store=temp_store)

In [19]:
# Run the rechunking plan, showing a dask progress bar while it executes.
with ProgressBar():
    plan.execute()

[########################################] | 100% Completed |  5min 40.7s


In [10]:
# Re-open the rechunked haplotypes. Per the output below, each chunk now spans all
# 2328 haplotypes.
ht = xr.open_zarr(str(here() / 'data/sgkit/ag1000g_ht_rechunked.zarr'), concat_characters=False)
ht

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.22 GB
Shape,"(39604636, 2328)","(524288, 2328)"
Count,77 Tasks,76 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 92.20 GB 1.22 GB Shape (39604636, 2328) (524288, 2328) Count 77 Tasks 76 Chunks Type int8 numpy.ndarray",2328  39604636,

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.22 GB
Shape,"(39604636, 2328)","(524288, 2328)"
Count,77 Tasks,76 Chunks
Type,int8,numpy.ndarray


Combine the windowed dataset with the rechunked haplotypes:

In [11]:
# Merge the windowed dataset (with cohort variables) and the rechunked haplotypes
# into a single dataset to pass to the H-statistics computation.
ds2 = xr.merge([ds, ht])
ds2

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.20 GB
Shape,"(39604636, 1164, 2)","(524288, 1142, 2)"
Count,153 Tasks,152 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 92.20 GB 1.20 GB Shape (39604636, 1164, 2) (524288, 1142, 2) Count 153 Tasks 152 Chunks Type int8 numpy.ndarray",2  1164  39604636,

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.20 GB
Shape,"(39604636, 1164, 2)","(524288, 1142, 2)"
Count,153 Tasks,152 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.20 GB
Shape,"(39604636, 1164, 2)","(524288, 1142, 2)"
Count,153 Tasks,152 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 92.20 GB 1.20 GB Shape (39604636, 1164, 2) (524288, 1142, 2) Count 153 Tasks 152 Chunks Type bool numpy.ndarray",2  1164  39604636,

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.20 GB
Shape,"(39604636, 1164, 2)","(524288, 1142, 2)"
Count,153 Tasks,152 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,37.25 kB,37.25 kB
Shape,"(1164,)","(1164,)"
Count,2 Tasks,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 37.25 kB 37.25 kB Shape (1164,) (1164,) Count 2 Tasks 1 Chunks Type numpy.ndarray",1164  1,

Unnamed: 0,Array,Chunk
Bytes,37.25 kB,37.25 kB
Shape,"(1164,)","(1164,)"
Count,2 Tasks,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,79.21 MB,8.39 MB
Shape,"(39604636, 2)","(4194304, 2)"
Count,11 Tasks,10 Chunks
Type,|S1,numpy.ndarray
"Array Chunk Bytes 79.21 MB 8.39 MB Shape (39604636, 2) (4194304, 2) Count 11 Tasks 10 Chunks Type |S1 numpy.ndarray",2  39604636,

Unnamed: 0,Array,Chunk
Bytes,79.21 MB,8.39 MB
Shape,"(39604636, 2)","(4194304, 2)"
Count,11 Tasks,10 Chunks
Type,|S1,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,158.42 MB,16.78 MB
Shape,"(39604636,)","(4194304,)"
Count,11 Tasks,10 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 158.42 MB 16.78 MB Shape (39604636,) (4194304,) Count 11 Tasks 10 Chunks Type int32 numpy.ndarray",39604636  1,

Unnamed: 0,Array,Chunk
Bytes,158.42 MB,16.78 MB
Shape,"(39604636,)","(4194304,)"
Count,11 Tasks,10 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,158.42 MB,16.78 MB
Shape,"(39604636,)","(4194304,)"
Count,11 Tasks,10 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 158.42 MB 16.78 MB Shape (39604636,) (4194304,) Count 11 Tasks 10 Chunks Type int32 numpy.ndarray",39604636  1,

Unnamed: 0,Array,Chunk
Bytes,158.42 MB,16.78 MB
Shape,"(39604636,)","(4194304,)"
Count,11 Tasks,10 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.22 GB
Shape,"(39604636, 2328)","(524288, 2328)"
Count,77 Tasks,76 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 92.20 GB 1.22 GB Shape (39604636, 2328) (524288, 2328) Count 77 Tasks 76 Chunks Type int8 numpy.ndarray",2328  39604636,

Unnamed: 0,Array,Chunk
Bytes,92.20 GB,1.22 GB
Shape,"(39604636, 2328)","(524288, 2328)"
Count,77 Tasks,76 Chunks
Type,int8,numpy.ndarray


In [12]:
# Compute the Garud H statistics on the merged dataset. Per the output below, each
# result is a lazy (13203, 15) array, i.e. 13203 windows by 15 cohorts.
# Note: `h2` here names the results dataset, not the H2 statistic.
h2 = sg.Garud_h(ds2, merge=False)
h2

Unnamed: 0,Array,Chunk
Bytes,1.58 MB,21.00 kB
Shape,"(13203, 15)","(175, 15)"
Count,770 Tasks,76 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.58 MB 21.00 kB Shape (13203, 15) (175, 15) Count 770 Tasks 76 Chunks Type float64 numpy.ndarray",15  13203,

Unnamed: 0,Array,Chunk
Bytes,1.58 MB,21.00 kB
Shape,"(13203, 15)","(175, 15)"
Count,770 Tasks,76 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.58 MB,21.00 kB
Shape,"(13203, 15)","(175, 15)"
Count,770 Tasks,76 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.58 MB 21.00 kB Shape (13203, 15) (175, 15) Count 770 Tasks 76 Chunks Type float64 numpy.ndarray",15  13203,

Unnamed: 0,Array,Chunk
Bytes,1.58 MB,21.00 kB
Shape,"(13203, 15)","(175, 15)"
Count,770 Tasks,76 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.58 MB,21.00 kB
Shape,"(13203, 15)","(175, 15)"
Count,770 Tasks,76 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.58 MB 21.00 kB Shape (13203, 15) (175, 15) Count 770 Tasks 76 Chunks Type float64 numpy.ndarray",15  13203,

Unnamed: 0,Array,Chunk
Bytes,1.58 MB,21.00 kB
Shape,"(13203, 15)","(175, 15)"
Count,770 Tasks,76 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.58 MB,21.00 kB
Shape,"(13203, 15)","(175, 15)"
Count,770 Tasks,76 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.58 MB 21.00 kB Shape (13203, 15) (175, 15) Count 770 Tasks 76 Chunks Type float64 numpy.ndarray",15  13203,

Unnamed: 0,Array,Chunk
Bytes,1.58 MB,21.00 kB
Shape,"(13203, 15)","(175, 15)"
Count,770 Tasks,76 Chunks
Type,float64,numpy.ndarray


In [13]:
# Compute the H statistics and persist them to zarr; this is where the dask graph
# built above is actually executed. mode="w" replaces any existing store.
with ProgressBar():
    h2 = h2.chunk({"windows": 175}) # rechunk to uniform window sizes so we can save to zarr
    h2.to_zarr(str(here() / 'data/sgkit/ag1000g_h12.zarr'), mode="w")

[########################################] | 100% Completed |  7min 41.4s


In [14]:
# Load the saved H statistics back from disk.
# Note: this rebinds `h`, which earlier in the notebook held the call haplotypes.
h = xr.open_zarr(str(here() / 'data/sgkit/ag1000g_h12.zarr'), concat_characters=False)
# Label the "cohorts" dimension with the cohort names so cohorts can be selected
# by name with .sel(cohorts=...).
h = h.assign_coords({"cohorts": list(pop_defs)})
h

Unnamed: 0,Array,Chunk
Bytes,1.58 MB,21.00 kB
Shape,"(13203, 15)","(175, 15)"
Count,77 Tasks,76 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.58 MB 21.00 kB Shape (13203, 15) (175, 15) Count 77 Tasks 76 Chunks Type float64 numpy.ndarray",15  13203,

Unnamed: 0,Array,Chunk
Bytes,1.58 MB,21.00 kB
Shape,"(13203, 15)","(175, 15)"
Count,77 Tasks,76 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.58 MB,21.00 kB
Shape,"(13203, 15)","(175, 15)"
Count,77 Tasks,76 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.58 MB 21.00 kB Shape (13203, 15) (175, 15) Count 77 Tasks 76 Chunks Type float64 numpy.ndarray",15  13203,

Unnamed: 0,Array,Chunk
Bytes,1.58 MB,21.00 kB
Shape,"(13203, 15)","(175, 15)"
Count,77 Tasks,76 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.58 MB,21.00 kB
Shape,"(13203, 15)","(175, 15)"
Count,77 Tasks,76 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.58 MB 21.00 kB Shape (13203, 15) (175, 15) Count 77 Tasks 76 Chunks Type float64 numpy.ndarray",15  13203,

Unnamed: 0,Array,Chunk
Bytes,1.58 MB,21.00 kB
Shape,"(13203, 15)","(175, 15)"
Count,77 Tasks,76 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.58 MB,21.00 kB
Shape,"(13203, 15)","(175, 15)"
Count,77 Tasks,76 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.58 MB 21.00 kB Shape (13203, 15) (175, 15) Count 77 Tasks 76 Chunks Type float64 numpy.ndarray",15  13203,

Unnamed: 0,Array,Chunk
Bytes,1.58 MB,21.00 kB
Shape,"(13203, 15)","(175, 15)"
Count,77 Tasks,76 Chunks
Type,float64,numpy.ndarray


Have a look at the H12 values for a given cohort:

In [23]:
# H12 for the "ao_col" cohort (selected by the cohort label), restricted to the
# first 1000 windows and loaded into memory as a NumPy array.
stat_h12 = h["stat_Garud_h12"].sel(cohorts="ao_col")[:1000].values
stat_h12

array([0.03591387, 0.04511834, 0.0591716 , 0.08144313, 0.08136095,
       0.07569034, 0.05136423, 0.02514793, 0.03131164, 0.05530901,
       0.07166338, 0.07618343, 0.04791256, 0.06204799, 0.07240302,
       0.05481591, 0.05613083, 0.06804734, 0.06780079, 0.06517094,
       0.08793557, 0.08694938, 0.05374753, 0.04092702, 0.04322814,
       0.07363577, 0.04413215, 0.03640697, 0.0433925 , 0.03714661,
       0.046762  , 0.03813281, 0.03295529, 0.03690007, 0.05193951,
       0.04133794, 0.03131164, 0.03369494, 0.03525641, 0.02638067,
       0.02966798, 0.02777778, 0.02169625, 0.02473702, 0.0190664 ,
       0.02095661, 0.01734057, 0.01569691, 0.01840894, 0.01816239,
       0.01545036, 0.01405325, 0.01084813, 0.00920447, 0.01158777,
       0.01881986, 0.01783366, 0.01791584, 0.01857331, 0.01454635,
       0.01536818, 0.01487508, 0.01840894, 0.02539448, 0.01964168,
       0.01454635, 0.01692965, 0.0176693 , 0.02276463, 0.01939513,
       0.01619001, 0.01873767, 0.01602564, 0.01602564, 0.01931

## Concordance with scikit-allel

In [16]:
# Open the zarr store holding the scikit-allel H12 results used for comparison.
# No `mode` is passed, so zarr's default open mode applies.
h12_root = zarr.open(str(here() / 'data/gwss/h12/h12.zarr'))

In [17]:
def get_scikit_allel_h12(pop, chromosome, window_size=None, window_step=200, markersize=1):
    """Load previously computed scikit-allel H12 results from the ``h12_root`` zarr store.

    This is a pure lookup: nothing is computed, and the store is not modified.

    Parameters
    ----------
    pop : str
        Cohort key. Used as the first component of the group path and, when
        ``window_size`` is falsy, to look up ``pop_defs[pop]['h12_window_size']``.
    chromosome : str
        Last component of the group path.
    window_size : int, optional
        Window size component of the group path. Falls back to the cohort's
        ``h12_window_size`` from ``pop_defs`` when falsy.
    window_step : int, optional
        Window step component of the group path. Falls back to
        ``window_size // 2`` when falsy. Because the default is 200, that
        fallback only applies when the caller explicitly passes ``None`` or 0.
    markersize : int, optional
        Unused; kept so that existing callers' signatures keep working.

    Returns
    -------
    tuple or None
        ``(windows, gwindows, h1, h12, h123, h2_h1)``, each read fully into
        memory, when the group exists and its ``complete`` attribute is truthy;
        otherwise ``None``.
    """
    if not window_size:
        window_size = pop_defs[pop]['h12_window_size']
    if not window_step:
        window_step = window_size // 2

    grp_path = f'/{pop}/{window_size}/{window_step}/{chromosome}'

    # Use a membership test rather than require_group(): a lookup should not
    # create empty groups in the results store as a side effect, and it should
    # also work when the store is opened read-only.
    if grp_path not in h12_root:
        return None

    grp = h12_root[grp_path]
    if not grp.attrs.get('complete', False):
        # Group exists but was never marked as finished; treat as missing.
        return None

    # Previously run: load every result array from zarr.
    array_names = ('windows', 'gwindows', 'h1', 'h12', 'h123', 'h2_h1')
    return tuple(grp[name][:] for name in array_names)

In [19]:
# Fetch the stored scikit-allel results for the same cohort and window settings
# (size 6000, step 3000) used for the sgkit run above.
ska_result = get_scikit_allel_h12("ao_col", "2", 6000, 3000)
if ska_result is None:
    # get_scikit_allel_h12 returns None when no completed run is stored; unpacking
    # None directly would fail with an opaque "cannot unpack" TypeError.
    raise RuntimeError(
        'No completed scikit-allel H12 results found for cohort "ao_col", '
        'chromosome "2", window size 6000, step 3000 in data/gwss/h12/h12.zarr; '
        'run the scikit-allel H12 scan first.'
    )
ska_windows, ska_gwindows, ska_h1, ska_h12, ska_h123, ska_h2_h1 = ska_result

In [20]:
# Preview the first 100 scikit-allel H12 values for a visual comparison with the
# sgkit values displayed above.
ska_h12[:100]

array([0.03591387, 0.04511834, 0.0591716 , 0.08144313, 0.08136095,
       0.07569034, 0.05136423, 0.02514793, 0.03131164, 0.05530901,
       0.07166338, 0.07618343, 0.04791256, 0.06204799, 0.07240302,
       0.05481591, 0.05613083, 0.06804734, 0.06780079, 0.06517094,
       0.08793557, 0.08694938, 0.05374753, 0.04092702, 0.04322814,
       0.07363577, 0.04413215, 0.03640697, 0.0433925 , 0.03714661,
       0.046762  , 0.03813281, 0.03295529, 0.03690007, 0.05193951,
       0.04133794, 0.03131164, 0.03369494, 0.03525641, 0.02638067,
       0.02966798, 0.02777778, 0.02169625, 0.02473702, 0.0190664 ,
       0.02095661, 0.01734057, 0.01569691, 0.01840894, 0.01816239,
       0.01545036, 0.01405325, 0.01084813, 0.00920447, 0.01158777,
       0.01881986, 0.01783366, 0.01791584, 0.01857331, 0.01454635,
       0.01536818, 0.01487508, 0.01840894, 0.02539448, 0.01964168,
       0.01454635, 0.01692965, 0.0176693 , 0.02276463, 0.01939513,
       0.01619001, 0.01873767, 0.01602564, 0.01602564, 0.01931

Are they equal? (Compare the first 1000 windows.)

In [24]:
# Check numerically that the sgkit and scikit-allel H12 values agree over the first
# 1000 windows; assert_allclose raises AssertionError if any pair differs beyond its
# default tolerance or if the shapes do not match.
import numpy as np
np.testing.assert_allclose(stat_h12[:1000], ska_h12[:1000])