In [1]:
import pandas as pd
import numpy as np
import anndata as ad
import sgkit as sg
import cellink  as cl
from pathlib import Path

In [2]:
from cellink.io import read_sgkit_zarr

In [3]:
id_col = "#Uploaded_variation"

In [4]:
zarr_file_path = "/data/ouga/home/ag_gagneur/hoev/s_deeprvat/eva/theislab/hackathon24/chr22.dose.filtered.R2_0.8.vcz"
#genotype zarr file

## Load data as zarr file 

In [5]:
gdata = read_sgkit_zarr(zarr_file_path)
gdata

AnnData object with n_obs × n_vars = 1034 × 143083
    obs: 'id'
    var: 'chrom', 'pos', 'a0', 'a1', 'AF', 'ER2', 'maf', 'R2', 'contig', 'id', 'id_mask', 'quality'
    varm: 'filter'

## Write variants to VCF for annotation 

In [6]:
cl.tl.write_variants_to_vcf(gdata, out_file="variants.vcf")

[2024-10-09 15:43:33,181] INFO:cellink.tl._annotate_snps_genotype_data: number of variants to annotate: 143083
[2024-10-09 15:43:33,182] INFO:cellink.tl._annotate_snps_genotype_data: Writing variants to variants.vcf


## Annotate variants

### Using VEP 

### run VEP

In [7]:
anno_file = "variant_vep_annotated.txt"
config_file  = "../docs/example/config.yaml"


In [8]:
# cl.tl.run_vep(config_file,
#              input_vcf="variants.vcf",
#              output=anno_file) #writes "variant_vep_annotated.txt"


### process VEP annos

In [9]:
%%time
gdata = cl.tl.add_vep_annos_to_gdata(anno_file, gdata,
                                    id_col = "#Uploaded_variation",
                             cols_to_explode=["Consequence"],
                             cols_to_dummy=["Consequence"])
gdata

[2024-10-09 15:43:33,946] INFO:cellink.tl._annotate_snps_genotype_data: Index(['#Uploaded_variation', 'Location', 'Allele', 'Gene', 'Feature',
       'Feature_type', 'Consequence', 'cDNA_position', 'CDS_position',
       'Protein_position', 'Amino_acids', 'Codons', 'Existing_variation',
       'IMPACT', 'DISTANCE', 'STRAND', 'FLAGS', 'BIOTYPE', 'CANONICAL', 'ENSP',
       'SIFT', 'PolyPhen', 'gnomADe_AF', 'gnomADe_AFR_AF', 'gnomADe_AMR_AF',
       'gnomADe_ASJ_AF', 'gnomADe_EAS_AF', 'gnomADe_FIN_AF', 'gnomADe_NFE_AF',
       'gnomADe_OTH_AF', 'gnomADe_SAS_AF', 'CLIN_SIG', 'SOMATIC', 'PHENO',
       'CADD_PHRED', 'CADD_RAW', 'TSSDistance'],
      dtype='object')
[2024-10-09 15:43:34,262] INFO:cellink.tl.utils: Exploding column Consequence
[2024-10-09 15:43:34,774] INFO:cellink.tl.utils: Number of original rows: 182552.
 Number of exploded rows: 200116
[2024-10-09 15:43:34,786] INFO:cellink.tl.utils: Making dummies from column Consequence
[2024-10-09 15:43:34,980] INFO:cellink.tl._annota

AnnData object with n_obs × n_vars = 1034 × 143083
    obs: 'id'
    var: 'chrom', 'pos', 'a0', 'a1', 'AF', 'ER2', 'maf', 'R2', 'contig', 'id', 'id_mask', 'quality'
    varm: 'annotations_0', 'annotations_1', 'annotations_2', 'annotations_3', 'annotations_4', 'annotations_5', 'annotations_6', 'annotations_7'

## Save gdata with annotations (current workaround because of some dtype issues with some columns)

In [24]:
# current workaround for saving 
import pickle
with open("gdata_varm.pkl", "wb") as f:
    pickle.dump(gdata.varm, f)
del gdata.varm
gdata.write("gdata.h5ad")

## Load data 

In [27]:
gdata = ad.read_h5ad("gdata.h5ad")
with open("gdata_varm.pkl", "rb") as f:
    varm = pickle.load(f)

gdata.varm = varm

AnnData object with n_obs × n_vars = 1034 × 143083
    obs: 'id'
    var: 'chrom', 'pos', 'a0', 'a1', 'AF', 'ER2', 'maf', 'R2', 'contig', 'id', 'id_mask', 'quality'