# sgkit rechunking

In [1]:
# Execute the shared setup notebook inside this kernel's namespace.
# NOTE(review): `here` and `zarr` are used below without being imported in this
# notebook, so they presumably come from setup.ipynb — confirm.
%run setup.ipynb

In [2]:
# Load IPython's autoreload extension; mode 2 re-imports modules (except those
# excluded via %aimport) before each cell executes, so edits to local code are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from dask.diagnostics import ProgressBar

In [4]:
# Path of the Zarr store that every section below opens as its rechunking source.
# NOTE(review): the variable is called `output`, yet in this notebook it is only
# ever read from — presumably it was the output of an earlier conversion step.
output = here() / 'data/sgkit/ag1000g.zarr'

## Rechunk only `call_genotype` manually

In [5]:
# Open the source store and select the `call_genotype` array from it.
# NOTE(review): no `mode` is given, and the recorded info reports "Read-only: False";
# consider opening with mode='r' since the source is only read — confirm.
source_group = zarr.open(str(output))
source = source_group["call_genotype"]
# Display the array's metadata (shape, chunk shape, compressor, size).
source.info

0,1
Name,/call_genotype
Type,zarr.core.Array
Data type,int8
Shape,"(57837885, 1142, 2)"
Chunk shape,"(524288, 61, 2)"
Order,C
Read-only,False
Compressor,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)"
Store type,zarr.storage.DirectoryStore
No. bytes,132101729340 (123.0G)


In [43]:
# Desired chunk shape: same extent as the source chunks (524288, 61, 2) on the
# first and last axes, but the whole second axis (1142) in a single chunk.
target_chunks = (524288, 1142, 2)
# Memory limit string handed to rechunker as its `max_mem` argument.
max_mem = '2GB'

# Destination store for the rechunked array, and a sibling path intended as
# scratch space for rechunker's intermediate data.
target_store = str(here() / 'data/sgkit/ag1000g_rechunked2.zarr')
temp_store = str(here() / 'data/sgkit/ag1000g_rechunked2_tmp.zarr')

In [44]:
from rechunker import api as rechunker_api

# Build (but do not yet run) the plan that rewrites `source` with `target_chunks`.
# `temp_store` is passed explicitly so that rechunker has somewhere to write
# intermediate data if the plan needs an intermediate stage under the `max_mem`
# limit; the other rechunk calls in this notebook pass it the same way.
plan = rechunker_api.rechunk(
    source,
    target_chunks,
    max_mem,
    target_store,
    temp_store=temp_store,
)
# Display the plan object.
plan

In [12]:
# Run the rechunking plan; the dask ProgressBar context prints the progress bar
# and elapsed time seen in the recorded output.
with ProgressBar():
    plan.execute()

[########################################] | 100% Completed |  2min  1.4s


In [17]:
! du -sh ../../data/sgkit/ag1000g_rechunked2.zarr

6.5G	../../data/sgkit/ag1000g_rechunked2.zarr


In [21]:
! du -sh ../../data/sgkit/ag1000g.zarr/call_genotype

6.7G	../../data/sgkit/ag1000g.zarr/call_genotype


## Rechunk using rechunker's Zarr group support

This seems to be the best way to rechunk an sgkit dataset: all arrays in the group are handled by a single rechunking plan.

In [20]:
# Destination and scratch stores for the whole-group rechunk.
target_store = str(here() / 'data/sgkit/ag1000g_rechunked4.zarr')
temp_store = str(here() / 'data/sgkit/ag1000g_rechunked4_tmp.zarr')

# Memory limit string handed to rechunker as `max_mem`.
max_mem = '2GB'

# Per-array target chunks, keyed by array name within the group.
# Entries set to None specify no new chunk shape — presumably rechunker keeps
# the source chunking for those arrays; confirm against the rechunker docs.
target_chunks = {
    "call_genotype": (524288, 1142, 2),
    "call_genotype_mask": (524288, 1142, 2),
    "sample_id": None,
    "variant_allele": None,
    "variant_contig": None,
    "variant_position": None,
}

# Open the source store itself (not a single array) so that it can be passed
# to rechunker as a group.
source_group = zarr.open(str(output))

In [21]:
from rechunker import api as rechunker_api

# Plan the rechunk of every array listed in `target_chunks` in one go.
# `temp_store` is passed explicitly so that rechunker has a location for
# intermediate data should any array need an intermediate stage under `max_mem`.
plan = rechunker_api.rechunk(
    source_group,
    target_chunks,
    max_mem,
    target_store,
    temp_store=temp_store,
)
# Display the plan object.
plan

In [22]:
# Execute the group rechunking plan with a dask progress bar.
with ProgressBar():
    plan.execute()

[########################################] | 100% Completed |  3min 50.2s


In [33]:
! du -sh ../../data/sgkit/ag1000g_rechunked4.zarr

7.2G	../../data/sgkit/ag1000g_rechunked4.zarr


In [28]:
# NOTE(review): these are the same target/temp paths as the previous group
# rechunk in this notebook — confirm whether the earlier result is meant to be
# replaced, or removed before this plan is executed.
target_store = str(here() / 'data/sgkit/ag1000g_rechunked4.zarr')
temp_store = str(here() / 'data/sgkit/ag1000g_rechunked4_tmp.zarr')

# Memory limit string handed to rechunker as `max_mem`.
max_mem = '2GB'

# Second variant: 65536-long chunks on the first axis for every array that has
# an explicit entry; `sample_id` is left as None.
target_chunks = {
    "call_genotype": (65536, 1142, 2),
    "call_genotype_mask": (65536, 1142, 2),
    "sample_id": None,
    "variant_allele": (65536, 4),
    "variant_contig": (65536,),
    "variant_position": (65536,),
}

# Open the source store as a group for rechunker.
source_group = zarr.open(str(output))

In [31]:
from rechunker import api as rechunker_api
# Plan the group rechunk with the smaller chunks; a temporary store is supplied
# for any intermediate data rechunker needs to write.
plan = rechunker_api.rechunk(source_group, target_chunks, max_mem, target_store, temp_store=temp_store)
# Display the plan object.
plan

In [32]:
# Execute the plan with a dask progress bar (the recorded run took about 9.5 minutes).
with ProgressBar():
    plan.execute()

[########################################] | 100% Completed |  9min 33.3s


## Rechunk using rechunker's xarray support

In [5]:
import xarray as xr
# Open the same source store as an xarray Dataset.
# concat_characters=False leaves character arrays un-joined; the recorded repr
# shows a (57837885, 4) array of dtype |S1 accordingly.
ds = xr.open_zarr(str(output), concat_characters=False)
# Display the dataset repr.
ds

Unnamed: 0,Array,Chunk
Bytes,132.10 GB,63.96 MB
Shape,"(57837885, 1142, 2)","(524288, 61, 2)"
Count,2110 Tasks,2109 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 132.10 GB 63.96 MB Shape (57837885, 1142, 2) (524288, 61, 2) Count 2110 Tasks 2109 Chunks Type int8 numpy.ndarray",2  1142  57837885,

Unnamed: 0,Array,Chunk
Bytes,132.10 GB,63.96 MB
Shape,"(57837885, 1142, 2)","(524288, 61, 2)"
Count,2110 Tasks,2109 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,132.10 GB,63.96 MB
Shape,"(57837885, 1142, 2)","(524288, 61, 2)"
Count,2110 Tasks,2109 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 132.10 GB 63.96 MB Shape (57837885, 1142, 2) (524288, 61, 2) Count 2110 Tasks 2109 Chunks Type bool numpy.ndarray",2  1142  57837885,

Unnamed: 0,Array,Chunk
Bytes,132.10 GB,63.96 MB
Shape,"(57837885, 1142, 2)","(524288, 61, 2)"
Count,2110 Tasks,2109 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,9.14 kB,9.14 kB
Shape,"(1142,)","(1142,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 9.14 kB 9.14 kB Shape (1142,) (1142,) Count 2 Tasks 1 Chunks Type object numpy.ndarray",1142  1,

Unnamed: 0,Array,Chunk
Bytes,9.14 kB,9.14 kB
Shape,"(1142,)","(1142,)"
Count,2 Tasks,1 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,231.35 MB,16.78 MB
Shape,"(57837885, 4)","(4194304, 4)"
Count,15 Tasks,14 Chunks
Type,|S1,numpy.ndarray
"Array Chunk Bytes 231.35 MB 16.78 MB Shape (57837885, 4) (4194304, 4) Count 15 Tasks 14 Chunks Type |S1 numpy.ndarray",4  57837885,

Unnamed: 0,Array,Chunk
Bytes,231.35 MB,16.78 MB
Shape,"(57837885, 4)","(4194304, 4)"
Count,15 Tasks,14 Chunks
Type,|S1,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,231.35 MB,16.78 MB
Shape,"(57837885,)","(4194304,)"
Count,15 Tasks,14 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 231.35 MB 16.78 MB Shape (57837885,) (4194304,) Count 15 Tasks 14 Chunks Type int32 numpy.ndarray",57837885  1,

Unnamed: 0,Array,Chunk
Bytes,231.35 MB,16.78 MB
Shape,"(57837885,)","(4194304,)"
Count,15 Tasks,14 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,231.35 MB,16.78 MB
Shape,"(57837885,)","(4194304,)"
Count,15 Tasks,14 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 231.35 MB 16.78 MB Shape (57837885,) (4194304,) Count 15 Tasks 14 Chunks Type int32 numpy.ndarray",57837885  1,

Unnamed: 0,Array,Chunk
Bytes,231.35 MB,16.78 MB
Shape,"(57837885,)","(4194304,)"
Count,15 Tasks,14 Chunks
Type,int32,numpy.ndarray


In [45]:
# Target chunks for the xarray-based rechunk, keyed by variable name.
target_chunks = {"call_genotype": (524288, 1142, 2)}
# Memory limit string handed to rechunker as `max_mem`.
max_mem = '2GB'

# Destination and scratch stores for this experiment.
target_store = str(here() / 'data/sgkit/ag1000g_rechunked3.zarr')
temp_store = str(here() / 'data/sgkit/ag1000g_rechunked3_tmp.zarr')

In [47]:
from rechunker import api as rechunker_api
# Wrap the single `call_genotype` variable in its own Dataset so that only this
# variable is handed to rechunker.
ds_cg = ds.call_genotype.to_dataset()
# Plan the rechunk from the xarray Dataset, supplying a temporary store.
plan = rechunker_api.rechunk(ds_cg, target_chunks, max_mem, target_store, temp_store=temp_store)
# Display the plan object.
plan

Unnamed: 0,Array,Chunk
Bytes,132.10 GB,63.96 MB
Shape,"(57837885, 1142, 2)","(524288, 61, 2)"
Count,2110 Tasks,2109 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 132.10 GB 63.96 MB Shape (57837885, 1142, 2) (524288, 61, 2) Count 2110 Tasks 2109 Chunks Type int8 numpy.ndarray",2  1142  57837885,

Unnamed: 0,Array,Chunk
Bytes,132.10 GB,63.96 MB
Shape,"(57837885, 1142, 2)","(524288, 61, 2)"
Count,2110 Tasks,2109 Chunks
Type,int8,numpy.ndarray


In [51]:
# Debugging aid: look at the plan's private `_plan` attribute; the recorded
# output shows it is a dask Delayed object.
plan._plan

Delayed('store-70871686-0ecf-11eb-afae-600308a8a26a')

In [19]:
# Debugging aid: inspect the private `_meta` of the array backing `call_genotype`;
# the recorded output is an empty three-dimensional int8 array.
ds.call_genotype.data._meta

array([], shape=(0, 0, 0), dtype=int8)