# Different options for Cellink API

In [8]:
from pathlib import Path
import sgkit as sg
import anndata as ad
import pandas as pd
import scanpy as sc

import logging

logger = logging.getLogger(__name__)

In [2]:
DATA = Path("/Users/jan.engelmann/projects/sc-eqtl/data")

In [3]:
vcf_file_path = DATA / "OneK1K_imputation_post_qc_r2_08/filter_vcf_r08/chr22.dose.filtered.R2_0.8.vcf.gz"

zarr_path = vcf_file_path.parent.parent / "filter_zarr_r08"
zarr_path.mkdir(exist_ok=True)

In [4]:
icf_file_path = zarr_path / vcf_file_path.with_suffix(".icf").name
zarr_file_path = (zarr_path / vcf_file_path.stem).with_suffix(".vcz")
zarr_file_path

PosixPath('/Users/jan.engelmann/projects/sc-eqtl/data/OneK1K_imputation_post_qc_r2_08/filter_zarr_r08/chr22.dose.filtered.R2_0.8.vcz')

## Convert VCF to Zarr

In [10]:
! vcf2zarr explode {vcf_file_path} {icf_file_path}

    Scan: 100%|███████████████████████████| 1.00/1.00 [00:00<00:00, 1.27files/s]
 Explode: 100%|███████████████████████████| 143k/143k [01:49<00:00, 1.30kvars/s]


In [18]:
! vcf2zarr encode {icf_file_path} {zarr_file_path}

  Encode: 100%|████████████████████████████| 3.11G/3.11G [01:07<00:00, 46.3MB/s]
Finalise: 100%|████████████████████████████| 16.0/16.0 [00:00<00:00, 268array/s]


In [19]:
! vcf2zarr inspect {zarr_file_path}

name                   dtype    stored      size           ratio    nchunks  chunk_size    avg_chunk_stored    shape              chunk_shape       compressor                                                      filters
---------------------  -------  ----------  ----------  --------  ---------  ------------  ------------------  -----------------  ----------------  --------------------------------------------------------------  ------------
/call_GP               float32  105.02 MiB  1.65 GiB      16             30  56.44 MiB     3.5 MiB             (143083, 1034, 3)  (10000, 1000, 3)  Blosc(cname='zstd', clevel=7, shuffle=NOSHUFFLE, blocksize=0)   None
/call_DS               float32  69.89 MiB   564.38 MiB     8.1           30  18.81 MiB     2.33 MiB            (143083, 1034)     (10000, 1000)     Blosc(cname='zstd', clevel=7, shuffle=NOSHUFFLE, blocksize=0)   None
/call_genotype         int8     5.26 MiB    282.19 MiB    54             30  9.41 MiB      179.58 KiB          (143083, 1

## Option 1 – matched single-cell anndata and sgkit dataset

In [6]:
scdata_path = DATA / "debug_OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad"
scdata = ad.read_h5ad(scdata_path)
scdata

AnnData object with n_obs × n_vars = 25908 × 32738
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'pool', 'individual', 'percent.mt', 'latent', 'nCount_SCT', 'nFeature_SCT', 'cell_type', 'cell_label', 'sex', 'age'
    var: 'GeneSymbol', 'features'

In [7]:
gdata = sg.load_dataset(zarr_file_path)
del gdata.attrs["vcf_header"]
gdata

Unnamed: 0,Array,Chunk
Bytes,564.38 MiB,38.15 MiB
Shape,"(143083, 1034)","(10000, 1000)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 564.38 MiB 38.15 MiB Shape (143083, 1034) (10000, 1000) Dask graph 30 chunks in 2 graph layers Data type float32 numpy.ndarray",1034  143083,

Unnamed: 0,Array,Chunk
Bytes,564.38 MiB,38.15 MiB
Shape,"(143083, 1034)","(10000, 1000)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.65 GiB,114.44 MiB
Shape,"(143083, 1034, 3)","(10000, 1000, 3)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.65 GiB 114.44 MiB Shape (143083, 1034, 3) (10000, 1000, 3) Dask graph 30 chunks in 2 graph layers Data type float32 numpy.ndarray",3  1034  143083,

Unnamed: 0,Array,Chunk
Bytes,1.65 GiB,114.44 MiB
Shape,"(143083, 1034, 3)","(10000, 1000, 3)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,282.19 MiB,19.07 MiB
Shape,"(143083, 1034, 2)","(10000, 1000, 2)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray
"Array Chunk Bytes 282.19 MiB 19.07 MiB Shape (143083, 1034, 2) (10000, 1000, 2) Dask graph 30 chunks in 2 graph layers Data type int8 numpy.ndarray",2  1034  143083,

Unnamed: 0,Array,Chunk
Bytes,282.19 MiB,19.07 MiB
Shape,"(143083, 1034, 2)","(10000, 1000, 2)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,282.19 MiB,19.07 MiB
Shape,"(143083, 1034, 2)","(10000, 1000, 2)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray
"Array Chunk Bytes 282.19 MiB 19.07 MiB Shape (143083, 1034, 2) (10000, 1000, 2) Dask graph 30 chunks in 2 graph layers Data type bool numpy.ndarray",2  1034  143083,

Unnamed: 0,Array,Chunk
Bytes,282.19 MiB,19.07 MiB
Shape,"(143083, 1034, 2)","(10000, 1000, 2)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,141.09 MiB,9.54 MiB
Shape,"(143083, 1034)","(10000, 1000)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray
"Array Chunk Bytes 141.09 MiB 9.54 MiB Shape (143083, 1034) (10000, 1000) Dask graph 30 chunks in 2 graph layers Data type bool numpy.ndarray",1034  143083,

Unnamed: 0,Array,Chunk
Bytes,141.09 MiB,9.54 MiB
Shape,"(143083, 1034)","(10000, 1000)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,8 B,8 B
Shape,"(1,)","(1,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 8 B 8 B Shape (1,) (1,) Dask graph 1 chunks in 2 graph layers Data type object numpy.ndarray",1  1,

Unnamed: 0,Array,Chunk
Bytes,8 B,8 B
Shape,"(1,)","(1,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,24 B,24 B
Shape,"(3,)","(3,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 24 B 24 B Shape (3,) (3,) Dask graph 1 chunks in 2 graph layers Data type object numpy.ndarray",3  1,

Unnamed: 0,Array,Chunk
Bytes,24 B,24 B
Shape,"(3,)","(3,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,8.08 kiB,7.81 kiB
Shape,"(1034,)","(1000,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 8.08 kiB 7.81 kiB Shape (1034,) (1000,) Dask graph 2 chunks in 2 graph layers Data type object numpy.ndarray",1034  1,

Unnamed: 0,Array,Chunk
Bytes,8.08 kiB,7.81 kiB
Shape,"(1034,)","(1000,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 558.92 kiB 39.06 kiB Shape (143083,) (10000,) Dask graph 15 chunks in 2 graph layers Data type float32 numpy.ndarray",143083  1,

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 558.92 kiB 39.06 kiB Shape (143083,) (10000,) Dask graph 15 chunks in 2 graph layers Data type float32 numpy.ndarray",143083  1,

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 558.92 kiB 39.06 kiB Shape (143083,) (10000,) Dask graph 15 chunks in 2 graph layers Data type float32 numpy.ndarray",143083  1,

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 558.92 kiB 39.06 kiB Shape (143083,) (10000,) Dask graph 15 chunks in 2 graph layers Data type float32 numpy.ndarray",143083  1,

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.18 MiB,156.25 kiB
Shape,"(143083, 2)","(10000, 2)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 2.18 MiB 156.25 kiB Shape (143083, 2) (10000, 2) Dask graph 15 chunks in 2 graph layers Data type object numpy.ndarray",2  143083,

Unnamed: 0,Array,Chunk
Bytes,2.18 MiB,156.25 kiB
Shape,"(143083, 2)","(10000, 2)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,139.73 kiB,9.77 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray
"Array Chunk Bytes 139.73 kiB 9.77 kiB Shape (143083,) (10000,) Dask graph 15 chunks in 2 graph layers Data type int8 numpy.ndarray",143083  1,

Unnamed: 0,Array,Chunk
Bytes,139.73 kiB,9.77 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,419.19 kiB,29.30 kiB
Shape,"(143083, 3)","(10000, 3)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray
"Array Chunk Bytes 419.19 kiB 29.30 kiB Shape (143083, 3) (10000, 3) Dask graph 15 chunks in 2 graph layers Data type bool numpy.ndarray",3  143083,

Unnamed: 0,Array,Chunk
Bytes,419.19 kiB,29.30 kiB
Shape,"(143083, 3)","(10000, 3)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.09 MiB,78.12 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 1.09 MiB 78.12 kiB Shape (143083,) (10000,) Dask graph 15 chunks in 2 graph layers Data type object numpy.ndarray",143083  1,

Unnamed: 0,Array,Chunk
Bytes,1.09 MiB,78.12 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,139.73 kiB,9.77 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray
"Array Chunk Bytes 139.73 kiB 9.77 kiB Shape (143083,) (10000,) Dask graph 15 chunks in 2 graph layers Data type bool numpy.ndarray",143083  1,

Unnamed: 0,Array,Chunk
Bytes,139.73 kiB,9.77 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray
"Array Chunk Bytes 558.92 kiB 39.06 kiB Shape (143083,) (10000,) Dask graph 15 chunks in 2 graph layers Data type int32 numpy.ndarray",143083  1,

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 558.92 kiB 39.06 kiB Shape (143083,) (10000,) Dask graph 15 chunks in 2 graph layers Data type float32 numpy.ndarray",143083  1,

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


Ideas
- load genetic data into anndata
- use xarray
- have joint datastructure with subsetting

In [9]:

def match_samples(gdata, scdata, g_on="sample_id", sc_on="individual", verbose=False):
    """Match samples between genetic and single-cell data."""
    scdata = scdata[scdata.obs[sc_on].sort_values().index]

    sc_index = pd.Index(scdata.obs[sc_on].unique())
    g_index = pd.Index(gdata[g_on].to_series())
    keep_samples = sc_index.intersection(g_index)
    if verbose:
        all_samples = sc_index.union(g_index)
        logger.warning("Keeping %s/%s samples", len(keep_samples), len(all_samples))
        logger.warning("Dropping %s/%s samples from genetic data", len(g_index) - len(keep_samples), len(g_index))
        logger.warning("Dropping %s/%s samples from single-cell data", len(sc_index) - len(keep_samples), len(sc_index))

    gdata = gdata.assign_coords(samples=gdata[g_on])
    gdata = gdata.sel(samples=keep_samples)
    scdata = scdata[scdata.obs[sc_on].isin(keep_samples)]

    return gdata, scdata


gdata, scdata = match_samples(gdata, scdata, verbose=True)

Keeping 20/1034 samples
Dropping 1014/1034 samples from genetic data
Dropping 0/20 samples from single-cell data


In [5]:
annot = (
    sc.queries.biomart_annotations(
        "hsapiens",
        ["ensembl_gene_id", "start_position", "end_position", "chromosome_name"],
    )
    .set_index("ensembl_gene_id")
    .drop_duplicates()
)
annot

Unnamed: 0_level_0,start_position,end_position,chromosome_name
ensembl_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000210049,577,647,MT
ENSG00000211459,648,1601,MT
ENSG00000210077,1602,1670,MT
ENSG00000210082,1671,3229,MT
ENSG00000209082,3230,3304,MT
...,...,...,...
ENSG00000232679,221819842,222064773,1
ENSG00000200033,221837334,221837437,1
ENSG00000228437,221966341,221984964,1
ENSG00000229463,235839483,235840182,1


In [10]:
scdata = scdata[:, scdata.var.index.isin(annot.index)]
scdata.var["chrom"] = annot.loc[scdata.var.index, "chromosome_name"].values
scdata.var["start"] = annot.loc[scdata.var.index, "start_position"].values
scdata.var["end"] = annot.loc[scdata.var.index, "end_position"].values
scdata.var

  scdata.var["chrom"] = annot.loc[scdata.var.index, "chromosome_name"].values


Unnamed: 0_level_0,GeneSymbol,features,chrom,start,end
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000243485,MIR1302-10,MIR1302-10,1,29554,31109
ENSG00000237613,FAM138A,FAM138A,1,34554,36081
ENSG00000186092,OR4F5,OR4F5,1,65419,71585
ENSG00000238009,RP11-34P13.7,RP11-34P13.7,1,89295,133566
ENSG00000239945,RP11-34P13.8,RP11-34P13.8,1,89551,91105
...,...,...,...,...,...
ENSG00000212907,MT-ND4L,MT-ND4L,MT,10470,10766
ENSG00000198886,MT-ND4,MT-ND4,MT,10760,12137
ENSG00000198786,MT-ND5,MT-ND5,MT,12337,14148
ENSG00000198695,MT-ND6,MT-ND6,MT,14149,14673


In [11]:
def match_regions(scdata, gdata, sc_on="chrom", verbose=False):
    """Filter SNPs and genes to same chromosome"""
    idx_to_chrom = gdata.contig_id.to_series()  # idx, chrom
    sc_regions = set(scdata.var[sc_on].unique())
    keep_regions = sc_regions.intersection(idx_to_chrom.values)

    keep_genes = scdata.var[sc_on].isin(keep_regions)

    keep_snps = gdata.variant_contig.isin(idx_to_chrom[idx_to_chrom.isin(keep_regions)].index).data.compute()
    if verbose:
        logger.warning("Keeping %s/%s genes", keep_genes.sum(), len(keep_genes))
        logger.warning("Keeping %s/%s snps", keep_snps.sum(), len(keep_snps))

    scdata = scdata[:, keep_genes]
    gdata = gdata.isel(variants=keep_snps)
    return scdata, gdata


scdata, gdata = match_regions(scdata, gdata, verbose=True)

Keeping 666/30774 genes
Keeping 143083/143083 snps


## Option 2 – Both single cell and genetic data in anndata

In [12]:
scdata_path = DATA / "debug_OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad"
scdata = ad.read_h5ad(scdata_path)

_gdata = sg.load_dataset(zarr_file_path)
del _gdata.attrs["vcf_header"]
_gdata

Unnamed: 0,Array,Chunk
Bytes,564.38 MiB,38.15 MiB
Shape,"(143083, 1034)","(10000, 1000)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 564.38 MiB 38.15 MiB Shape (143083, 1034) (10000, 1000) Dask graph 30 chunks in 2 graph layers Data type float32 numpy.ndarray",1034  143083,

Unnamed: 0,Array,Chunk
Bytes,564.38 MiB,38.15 MiB
Shape,"(143083, 1034)","(10000, 1000)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.65 GiB,114.44 MiB
Shape,"(143083, 1034, 3)","(10000, 1000, 3)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 1.65 GiB 114.44 MiB Shape (143083, 1034, 3) (10000, 1000, 3) Dask graph 30 chunks in 2 graph layers Data type float32 numpy.ndarray",3  1034  143083,

Unnamed: 0,Array,Chunk
Bytes,1.65 GiB,114.44 MiB
Shape,"(143083, 1034, 3)","(10000, 1000, 3)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,282.19 MiB,19.07 MiB
Shape,"(143083, 1034, 2)","(10000, 1000, 2)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray
"Array Chunk Bytes 282.19 MiB 19.07 MiB Shape (143083, 1034, 2) (10000, 1000, 2) Dask graph 30 chunks in 2 graph layers Data type int8 numpy.ndarray",2  1034  143083,

Unnamed: 0,Array,Chunk
Bytes,282.19 MiB,19.07 MiB
Shape,"(143083, 1034, 2)","(10000, 1000, 2)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,282.19 MiB,19.07 MiB
Shape,"(143083, 1034, 2)","(10000, 1000, 2)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray
"Array Chunk Bytes 282.19 MiB 19.07 MiB Shape (143083, 1034, 2) (10000, 1000, 2) Dask graph 30 chunks in 2 graph layers Data type bool numpy.ndarray",2  1034  143083,

Unnamed: 0,Array,Chunk
Bytes,282.19 MiB,19.07 MiB
Shape,"(143083, 1034, 2)","(10000, 1000, 2)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,141.09 MiB,9.54 MiB
Shape,"(143083, 1034)","(10000, 1000)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray
"Array Chunk Bytes 141.09 MiB 9.54 MiB Shape (143083, 1034) (10000, 1000) Dask graph 30 chunks in 2 graph layers Data type bool numpy.ndarray",1034  143083,

Unnamed: 0,Array,Chunk
Bytes,141.09 MiB,9.54 MiB
Shape,"(143083, 1034)","(10000, 1000)"
Dask graph,30 chunks in 2 graph layers,30 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,8 B,8 B
Shape,"(1,)","(1,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 8 B 8 B Shape (1,) (1,) Dask graph 1 chunks in 2 graph layers Data type object numpy.ndarray",1  1,

Unnamed: 0,Array,Chunk
Bytes,8 B,8 B
Shape,"(1,)","(1,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,24 B,24 B
Shape,"(3,)","(3,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 24 B 24 B Shape (3,) (3,) Dask graph 1 chunks in 2 graph layers Data type object numpy.ndarray",3  1,

Unnamed: 0,Array,Chunk
Bytes,24 B,24 B
Shape,"(3,)","(3,)"
Dask graph,1 chunks in 2 graph layers,1 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,8.08 kiB,7.81 kiB
Shape,"(1034,)","(1000,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 8.08 kiB 7.81 kiB Shape (1034,) (1000,) Dask graph 2 chunks in 2 graph layers Data type object numpy.ndarray",1034  1,

Unnamed: 0,Array,Chunk
Bytes,8.08 kiB,7.81 kiB
Shape,"(1034,)","(1000,)"
Dask graph,2 chunks in 2 graph layers,2 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 558.92 kiB 39.06 kiB Shape (143083,) (10000,) Dask graph 15 chunks in 2 graph layers Data type float32 numpy.ndarray",143083  1,

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 558.92 kiB 39.06 kiB Shape (143083,) (10000,) Dask graph 15 chunks in 2 graph layers Data type float32 numpy.ndarray",143083  1,

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 558.92 kiB 39.06 kiB Shape (143083,) (10000,) Dask graph 15 chunks in 2 graph layers Data type float32 numpy.ndarray",143083  1,

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 558.92 kiB 39.06 kiB Shape (143083,) (10000,) Dask graph 15 chunks in 2 graph layers Data type float32 numpy.ndarray",143083  1,

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.18 MiB,156.25 kiB
Shape,"(143083, 2)","(10000, 2)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 2.18 MiB 156.25 kiB Shape (143083, 2) (10000, 2) Dask graph 15 chunks in 2 graph layers Data type object numpy.ndarray",2  143083,

Unnamed: 0,Array,Chunk
Bytes,2.18 MiB,156.25 kiB
Shape,"(143083, 2)","(10000, 2)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,139.73 kiB,9.77 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray
"Array Chunk Bytes 139.73 kiB 9.77 kiB Shape (143083,) (10000,) Dask graph 15 chunks in 2 graph layers Data type int8 numpy.ndarray",143083  1,

Unnamed: 0,Array,Chunk
Bytes,139.73 kiB,9.77 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,int8 numpy.ndarray,int8 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,419.19 kiB,29.30 kiB
Shape,"(143083, 3)","(10000, 3)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray
"Array Chunk Bytes 419.19 kiB 29.30 kiB Shape (143083, 3) (10000, 3) Dask graph 15 chunks in 2 graph layers Data type bool numpy.ndarray",3  143083,

Unnamed: 0,Array,Chunk
Bytes,419.19 kiB,29.30 kiB
Shape,"(143083, 3)","(10000, 3)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.09 MiB,78.12 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray
"Array Chunk Bytes 1.09 MiB 78.12 kiB Shape (143083,) (10000,) Dask graph 15 chunks in 2 graph layers Data type object numpy.ndarray",143083  1,

Unnamed: 0,Array,Chunk
Bytes,1.09 MiB,78.12 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,object numpy.ndarray,object numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,139.73 kiB,9.77 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray
"Array Chunk Bytes 139.73 kiB 9.77 kiB Shape (143083,) (10000,) Dask graph 15 chunks in 2 graph layers Data type bool numpy.ndarray",143083  1,

Unnamed: 0,Array,Chunk
Bytes,139.73 kiB,9.77 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,bool numpy.ndarray,bool numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray
"Array Chunk Bytes 558.92 kiB 39.06 kiB Shape (143083,) (10000,) Dask graph 15 chunks in 2 graph layers Data type int32 numpy.ndarray",143083  1,

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,int32 numpy.ndarray,int32 numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray
"Array Chunk Bytes 558.92 kiB 39.06 kiB Shape (143083,) (10000,) Dask graph 15 chunks in 2 graph layers Data type float32 numpy.ndarray",143083  1,

Unnamed: 0,Array,Chunk
Bytes,558.92 kiB,39.06 kiB
Shape,"(143083,)","(10000,)"
Dask graph,15 chunks in 2 graph layers,15 chunks in 2 graph layers
Data type,float32 numpy.ndarray,float32 numpy.ndarray


In [44]:
X = _gdata.call_genotype.data.sum(-1).T
obs =  pd.DataFrame(index=_gdata.sample_id.data.compute())
var = {}
for k, v in _gdata.variables.items():
    if len(v.shape) == 1 and v.shape[0] == _gdata.variants.shape[0]:
        var[k.replace("variant_", "")] = v.data.compute()
var = pd.DataFrame(var)
var["chrom"] = _gdata.variant_contig.to_series().map({i: v for i, v in enumerate(_gdata.contig_id.data.compute())}).values
var["a0"] = _gdata.variant_allele[:, 0].data.compute().ravel()
var["a1"] = _gdata.variant_allele[:, 1].data.compute().ravel()
var.index = var["chrom"].astype(str) + "_" + var["position"].astype(str) + "_" + var["a0"].astype(str) + "_" + var["a1"].astype(str)
var.index.name = "variant_id"
first_cols = ['chrom', 'position', 'a0', 'a1']
var = var[first_cols + [c for c in var.columns if c not in first_cols]]
varm = {"filter": pd.DataFrame(_gdata.variant_filter.data.compute(), index=var.index, columns=_gdata.filter_id.data.compute())}
gdata = ad.AnnData(X=X, obs=obs, var=var, varm=varm)
gdata

AnnData object with n_obs × n_vars = 1034 × 143083
    var: 'chrom', 'position', 'a0', 'a1', 'AF', 'ER2', 'MAF', 'R2', 'contig', 'id', 'id_mask', 'quality'
    varm: 'filter'

In [46]:
def match_samples(gdata, scdata, sc_on="individual", verbose=False):
    """Match samples between genetic and single-cell data."""
    scdata = scdata[scdata.obs[sc_on].sort_values().index]

    sc_index = pd.Index(scdata.obs[sc_on].unique())
    g_index = gdata.obs.index
    keep_samples = sc_index.intersection(g_index)
    if verbose:
        all_samples = sc_index.union(g_index)
        logger.warning("Keeping %s/%s samples", len(keep_samples), len(all_samples))
        logger.warning("Dropping %s/%s samples from genetic data", len(g_index) - len(keep_samples), len(g_index))
        logger.warning("Dropping %s/%s samples from single-cell data", len(sc_index) - len(keep_samples), len(sc_index))

    gdata = gdata[keep_samples]
    scdata = scdata[scdata.obs[sc_on].isin(keep_samples)]

    return gdata, scdata


gdata, scdata = match_samples(gdata, scdata, verbose=True)

Keeping 20/1034 samples
Dropping 1014/1034 samples from genetic data
Dropping 0/20 samples from single-cell data


In [47]:
scdata = scdata[:, scdata.var.index.isin(annot.index)]
scdata.var["chrom"] = annot.loc[scdata.var.index, "chromosome_name"].values
scdata.var["start"] = annot.loc[scdata.var.index, "start_position"].values
scdata.var["end"] = annot.loc[scdata.var.index, "end_position"].values
scdata.var

  scdata.var["chrom"] = annot.loc[scdata.var.index, "chromosome_name"].values


Unnamed: 0_level_0,GeneSymbol,features,chrom,start,end
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000243485,MIR1302-10,MIR1302-10,1,29554,31109
ENSG00000237613,FAM138A,FAM138A,1,34554,36081
ENSG00000186092,OR4F5,OR4F5,1,65419,71585
ENSG00000238009,RP11-34P13.7,RP11-34P13.7,1,89295,133566
ENSG00000239945,RP11-34P13.8,RP11-34P13.8,1,89551,91105
...,...,...,...,...,...
ENSG00000212907,MT-ND4L,MT-ND4L,MT,10470,10766
ENSG00000198886,MT-ND4,MT-ND4,MT,10760,12137
ENSG00000198786,MT-ND5,MT-ND5,MT,12337,14148
ENSG00000198695,MT-ND6,MT-ND6,MT,14149,14673


In [48]:
def match_regions(scdata, gdata, sc_on="chrom", verbose=False):
    """Filter SNPs and genes to same chromosome"""
    idx_to_chrom = gdata.var.chrom  # idx, chrom
    sc_regions = set(scdata.var[sc_on].unique())
    keep_regions = sc_regions.intersection(idx_to_chrom.values)

    keep_genes = scdata.var[sc_on].isin(keep_regions)

    keep_snps = gdata.var.chrom.isin(keep_regions)
    if verbose:
        logger.warning("Keeping %s/%s genes", keep_genes.sum(), len(keep_genes))
        logger.warning("Keeping %s/%s snps", keep_snps.sum(), len(keep_snps))

    scdata = scdata[:, keep_genes]
    gdata = gdata[:, keep_snps]
    return scdata, gdata


scdata, gdata = match_regions(scdata, gdata, verbose=True)

Keeping 666/30774 genes
Keeping 143083/143083 snps
