In [1]:
import pandas as pd
import numpy as np
import anndata as ad
import sgkit as sg
import cellink  as cl
from pathlib import Path

In [2]:
from cellink.io import read_sgkit_zarr

In [4]:
# Genotype data stored as a Zarr directory (the path ends in ".vcz").
# NOTE(review): this is an absolute path on one specific filesystem; point it
# at your own copy of the data before running the notebook.
zarr_file_path = "/data/ouga/home/ag_gagneur/hoev/s_deeprvat/eva/theislab/hackathon24/chr22.dose.filtered.R2_0.8.vcz"

## Load genotype data from the Zarr store

In [5]:
# Read the Zarr store into `gdata`; the displayed repr shows the result is an
# AnnData object with per-variant metadata in `.var`.
gdata = read_sgkit_zarr(zarr_file_path)
gdata

AnnData object with n_obs × n_vars = 1034 × 143083
    obs: 'id'
    var: 'chrom', 'pos', 'a0', 'a1', 'AF', 'ER2', 'maf', 'R2', 'contig', 'id', 'id_mask', 'quality'
    varm: 'filter'

In [4]:
# Manual sgkit -> AnnData conversion (prototype of a function to be merged).
# NOTE(review): compared with the `read_sgkit_zarr` result displayed earlier,
# this version names the columns 'position'/'MAF' (not 'pos'/'maf'), has no
# `obs['id']`, and rebinds `gdata`. Confirm which schema the downstream cells
# expect before relying on this cell.

# Load the raw dataset explicitly so the cell no longer depends on a `_gdata`
# variable left over from an earlier kernel session.
_gdata = sg.load_dataset(zarr_file_path)

# Sum `call_genotype` over its last axis, then transpose.
X = _gdata.call_genotype.data.sum(-1).T
print(X.shape)
obs = pd.DataFrame(index=_gdata.sample_id.data.compute())

# Every 1-D variable whose length equals the number of variants becomes a
# column of `var`, with "variant_" removed from its name.
n_variants = _gdata.variants.shape[0]
var = {}
for k, v in _gdata.variables.items():
    if len(v.shape) == 1 and v.shape[0] == n_variants:
        print(k)
        var[k.replace("variant_", "")] = v.data.compute()
var = pd.DataFrame(var)

# Map the integer contig index of each variant to its contig id.
contig_names = dict(enumerate(_gdata.contig_id.data.compute()))
var["chrom"] = _gdata.variant_contig.to_series().map(contig_names).values
var["a0"] = _gdata.variant_allele[:, 0].data.compute().ravel()
var["a1"] = _gdata.variant_allele[:, 1].data.compute().ravel()

# Index variants as "<chrom>_<position>_<a0>_<a1>".
var.index = var["chrom"].astype(str) + "_" + var["position"].astype(str) + "_" + var["a0"].astype(str) + "_" + var["a1"].astype(str)
var.index.name = "variant_id"

# Put the identifying columns first, keeping the remaining columns in order.
first_cols = ['chrom', 'position', 'a0', 'a1']
var = var[first_cols + [c for c in var.columns if c not in first_cols]]

# One column per filter id, aligned to the variant index.
varm = {"filter": pd.DataFrame(_gdata.variant_filter.data.compute(), index=var.index, columns=_gdata.filter_id.data.compute())}
gdata = ad.AnnData(X=X, obs=obs, var=var, varm=varm)
gdata

(1034, 143083)
variant_AF
variant_ER2
variant_MAF
variant_R2
variant_contig
variant_id
variant_id_mask
variant_position
variant_quality


AnnData object with n_obs × n_vars = 1034 × 143083
    var: 'chrom', 'position', 'a0', 'a1', 'AF', 'ER2', 'MAF', 'R2', 'contig', 'id', 'id_mask', 'quality'
    varm: 'filter'

## Write variants to VCF for annotation 

In [6]:
# Export the variants held in `gdata` to "variants.vcf"; this is the file name
# the (commented-out) VEP call further down takes as `input_vcf`.
cl.tl.write_variants_to_vcf(gdata, out_file="variants.vcf")

[2024-10-09 10:13:45,906] INFO:cellink.tl._annotate_snps_genotype_data: number of variants to annotate: 143083
[2024-10-09 10:13:45,907] INFO:cellink.tl._annotate_snps_genotype_data: Writing variants to variants.vcf


## Annotate variants

### Using VEP 

### Run VEP

In [7]:
# VEP output table; read back by the annotation-parsing cell.
anno_file = "variant_vep_annotated.txt"
# Configuration file handed to `cl.tl.run_vep`.
config_file = "../docs/example/config.yaml"


In [7]:
# NOTE(review): the VEP run is left disabled, so `anno_file` must already exist
# from an earlier run for the following cells to work — presumably because VEP
# is slow or needs a local installation; confirm. Uncomment to regenerate it:
# cl.tl.run_vep(config_file,
#              input_vcf="variants.vcf",
#              output=anno_file) #writes "variant_vep_annotated.txt"


### Process VEP annotations

In [8]:
def read_vep_annos(vep_anno_file, cols_to_explode=("Consequence",),
                   cols_to_dummy=("Consequence",)):
    """Read a tab-separated VEP annotation file and expand selected columns.

    NOTE(review): this relies on `logger`, `_get_vep_start_row`,
    `_explode_columns` and `_add_dummy_cols`, none of which are defined in the
    visible notebook cells; the notebook itself calls the packaged
    `cl.tl.read_vep_annos` instead of this local definition.

    Parameters
    ----------
    vep_anno_file
        Path of the VEP output file. It is used both to determine how many
        leading rows to skip and as the file that is parsed.
    cols_to_explode
        Column names passed one at a time to `_explode_columns`.
    cols_to_dummy
        Column names passed one at a time to `_add_dummy_cols`.

    Returns
    -------
    pandas.DataFrame
        The parsed table after '-' cells have been replaced by NaN and the
        explode / dummy steps have been applied.
    """
    # Derive the rows to skip from the file actually being read. The previous
    # code used the notebook-level `anno_file` here, which silently ignored
    # the `vep_anno_file` argument.
    skip_rows = _get_vep_start_row(vep_anno_file)
    annos = pd.read_csv(vep_anno_file, sep="\t", skiprows=skip_rows)
    logger.info(f"{annos.columns}")

    annos = annos.replace("-", np.nan)
    for col in cols_to_explode:
        annos = _explode_columns(annos, col)
    for col in cols_to_dummy:
        annos = _add_dummy_cols(annos, col)
    # TODO: make function to collapse such that only one row per variant
    return annos


In [9]:
# Parse the VEP output with the packaged reader, exploding the "Consequence"
# column and turning it into dummy columns (see the log and table below).
annos = cl.tl.read_vep_annos(
    anno_file,
    cols_to_explode=["Consequence"],
    cols_to_dummy=["Consequence"],
)
annos

[2024-10-09 10:14:07,682] INFO:cellink.tl._annotate_snps_genotype_data: Index(['#Uploaded_variation', 'Location', 'Allele', 'Gene', 'Feature',
       'Feature_type', 'Consequence', 'cDNA_position', 'CDS_position',
       'Protein_position', 'Amino_acids', 'Codons', 'Existing_variation',
       'IMPACT', 'DISTANCE', 'STRAND', 'FLAGS', 'BIOTYPE', 'CANONICAL', 'ENSP',
       'SIFT', 'PolyPhen', 'gnomADe_AF', 'gnomADe_AFR_AF', 'gnomADe_AMR_AF',
       'gnomADe_ASJ_AF', 'gnomADe_EAS_AF', 'gnomADe_FIN_AF', 'gnomADe_NFE_AF',
       'gnomADe_OTH_AF', 'gnomADe_SAS_AF', 'CLIN_SIG', 'SOMATIC', 'PHENO',
       'CADD_PHRED', 'CADD_RAW', 'TSSDistance'],
      dtype='object')
[2024-10-09 10:14:08,004] INFO:cellink.tl.utils: Exploding column Consequence
[2024-10-09 10:14:08,540] INFO:cellink.tl.utils: Number of original rows: 182552.
 Number of exploded rows: 200116
[2024-10-09 10:14:08,553] INFO:cellink.tl.utils: Making dummies from column Consequence
[2024-10-09 10:14:08,748] INFO:cellink.tl._annota

  annos_cond.index.to_frame()[annos_cond.index.value_counts() > 1].index.unique()
  annos_cond_sub = annos_cond_sub.applymap(


Unnamed: 0_level_0,Location,Allele,Gene,Feature,Feature_type,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,...,Consequence_splice_donor_5th_base_variant,Consequence_splice_donor_region_variant,Consequence_splice_donor_variant,Consequence_splice_polypyrimidine_tract_variant,Consequence_splice_region_variant,Consequence_start_lost,Consequence_stop_gained,Consequence_stop_retained_variant,Consequence_synonymous_variant,Consequence_upstream_gene_variant
#Uploaded_variation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22_47192595_G/A,22:47192595,A,ENSG00000054611,ENST00000337137,Transcript,,,,,,...,0,0,0,0,0,0,0,0,0,0
22_27004964_G/A,22:27004964,A,ENSG00000100122,ENST00000215939,Transcript,,,,,,...,0,0,0,0,0,0,0,0,0,0
22_34890077_C/G,22:34890077,G,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
22_46931077_G/C,22:46931077,C,ENSG00000075275,ENST00000262738,Transcript,1991/11389,1991/9045,664/3014,S/W,tCg/tGg,...,0,0,0,0,0,0,0,0,0,0
22_35976572_A/G,22:35976572,G,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22_51202748_A/G,22:51202748,G,"[ENSG00000079974, ENSG00000184319, ENSG0000018...","[ENST00000395593, ENST00000496652, ENST0000049...",Transcript,,,,,,...,0,0,0,0,0,0,0,0,0,0
22_51208568_G/T,22:51208568,T,"[ENSG00000184319, ENSG00000184319, ENSG0000007...","[ENST00000496652, ENST00000496652, ENST0000039...",Transcript,,,,,,...,0,0,0,0,0,0,0,0,0,0
22_51211031_A/G,22:51211031,G,"[ENSG00000184319, ENSG00000184319, ENSG0000007...","[ENST00000496652, ENST00000496652, ENST0000039...",Transcript,,,,,,...,0,0,0,0,0,0,0,0,0,0
22_51213613_C/T,22:51213613,T,"[ENSG00000079974, ENSG00000184319, ENSG0000018...","[ENST00000395593, ENST00000496652, ENST0000049...",Transcript,,,,,,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Attach the annotations to the genotype object; per the log output, this joins
# `gdata.var` with `annos` on the index.
# NOTE(review): `gdata` is rebound to the merged result, so re-running this
# cell would merge into already-annotated data — re-run from the loading cell
# instead.
gdata = cl.tl.merge_annos_into_gdata(annos, gdata)
print(gdata.var.columns)

[2024-10-09 10:14:37,060] INFO:cellink.tl._annotate_snps_genotype_data: Joining gdata.var with annos on index
Index(['chrom', 'pos', 'a0', 'a1', 'AF', 'ER2', 'maf', 'R2', 'contig', 'id',
       'id_mask', 'quality', 'Location', 'Allele', 'Gene', 'Feature',
       'Feature_type', 'cDNA_position', 'CDS_position', 'Protein_position',
       'Amino_acids', 'Codons', 'Existing_variation', 'IMPACT', 'DISTANCE',
       'STRAND', 'FLAGS', 'BIOTYPE', 'CANONICAL', 'ENSP', 'SIFT', 'PolyPhen',
       'gnomADe_AF', 'gnomADe_AFR_AF', 'gnomADe_AMR_AF', 'gnomADe_ASJ_AF',
       'gnomADe_EAS_AF', 'gnomADe_FIN_AF', 'gnomADe_NFE_AF', 'gnomADe_OTH_AF',
       'gnomADe_SAS_AF', 'CLIN_SIG', 'SOMATIC', 'PHENO', 'CADD_PHRED',
       'CADD_RAW', 'TSSDistance', 'Consequence_3_prime_UTR_variant',
       'Consequence_5_prime_UTR_variant', 'Consequence_NMD_transcript_variant',
       'Consequence_coding_sequence_variant',
       'Consequence_downstream_gene_variant',
       'Consequence_incomplete_terminal_codon_v