In [2]:
from pathlib import Path

import anndata as ad
import cellink as cl

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
DATA = Path(cl.__file__).parent.parent.parent / "docs/tutorials/data"
print(DATA)
GENODATA = DATA / "eqtl_cat_genotypes"

gpc_path = GENODATA / "pcdir/OneK1K.noGP.filtered.pruned.eigenvec"
adata_path = DATA / "onk1k_cellxgene.h5ad"
gdata_path = GENODATA / "sample.vcz"
adata_path_out = DATA / "onk1k_cellxgene_donor_mapped.h5ad.gz"
adata_path_out_test = DATA / "onk1k_cellxgene_donor_mapped_cd4_naive.h5ad.gz"

/data/nasif12/home_if12/hoev/git/sc-genetics/docs/tutorials/data


## Get the data 
 
wget https://datasets.cellxgene.cziscience.com/81d84489-bff9-4fb6-b0ee-78348126eada.h5ad # move to DATA / onk1k_cellxgene.h5ad 

wget https://zenodo.org/records/7619796/files/OneK1K.noGP.vcf.gz.csi?download=1 # mv to GENODATA / OneK1K.noGP.vcf.gz.csi

wget https://zenodo.org/records/7619796/files/OneK1K.noGP.vcf.gz?download=1 # mv to GENODATA / OneK1K.noGP.vcf.gz

wget https://zenodo.org/records/7619796/files/gene_counts_Ensembl_105_phenotype_metadata.tsv.gz?download=1 # mv to DATA / gene_annotation.csv


## Prepare genotypes 

## genotypes for chr22 
`cd GENODATA`

`bcftools view -r 22 OneK1K.noGP.vcf.gz -Oz -o OneK1K.noGP_chr22.vcf.gz`

`vcf2zarr explode OneK1K.noGP_chr22.vcf.gz  sample.icf`

`vcf2zarr encode sample.icf sample.vcz`

## genetic PCs and Kinship
`python ./create_gdata/01_vcf_to_plink.py`

`python ./create_gdata/02_geno_pcs.py`

`python ./create_gdata/03_kinship.py`


## Prepare sc data 

In [4]:
gdata = cl.io.read_sgkit_zarr(gdata_path)

In [5]:
adata = ad.read_h5ad(adata_path)

In [6]:
adata.obs["donor_id"] = "OneK1K_" + adata.obs["donor_id"].str.split("_").str[1]

In [7]:
len(set(gdata.obs.index) - set(adata.obs["donor_id"]))

117

In [8]:
len(gdata.obs.index) - len(adata.obs["donor_id"].drop_duplicates())

117

In [None]:
adata.obs["sex"] = adata.obs["sex"].map({"male": 1, "female": 0}).astype(int)
adata.obs["sex"].unique()

array([1, 0])

In [16]:
adata.obs["age"] = adata.obs["age"].astype("int")

In [17]:
adata.obs[["age", "sex"]].dtypes

age    int64
sex    int64
dtype: object

In [18]:
adata.write(adata_path_out, compression="gzip")
adata

... storing 'donor_id' as categorical


AnnData object with n_obs × n_vars = 1248980 × 36469
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'donor_id', 'pool_number', 'predicted.celltype.l2', 'predicted.celltype.l2.score', 'age', 'organism_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'cell_type_ontology_term_id_colors', 'citation', 'default_embedding', 'schema_reference', 'schema_version', 'title'
    obsm: 'X_azi

In [19]:
cell_type = "CD4 Naive"
celltype_key = "predicted.celltype.l2"

In [20]:
adata_test = adata[adata.obs[celltype_key] == cell_type].copy()
adata_test

AnnData object with n_obs × n_vars = 259012 × 36469
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'donor_id', 'pool_number', 'predicted.celltype.l2', 'predicted.celltype.l2.score', 'age', 'organism_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'cell_type_ontology_term_id_colors', 'citation', 'default_embedding', 'schema_reference', 'schema_version', 'title'
    obsm: 'X_azim

In [21]:
adata.write(adata_path_out_test, compression="gzip")
adata

AnnData object with n_obs × n_vars = 1248980 × 36469
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'donor_id', 'pool_number', 'predicted.celltype.l2', 'predicted.celltype.l2.score', 'age', 'organism_ontology_term_id', 'tissue_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'suspension_type', 'tissue_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage', 'observation_joinid'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable', 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype', 'feature_length', 'feature_type'
    uns: 'cell_type_ontology_term_id_colors', 'citation', 'default_embedding', 'schema_reference', 'schema_version', 'title'
    obsm: 'X_azi