# Rare variant association testing 

In [25]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [26]:
import pandas as pd

In [27]:
import gc
from pathlib import Path
import warnings

import anndata as ad
import scanpy as sc
import dask.array as da
import numpy as np
from tqdm.auto import tqdm

import cellink as cl
from cellink._core import DAnn, GAnn
from cellink.tl._rvat import burden_test
from cellink.utils import column_normalize, gaussianize

In [28]:
# Root directory holding the input data read by this notebook (expression
# matrix, gene annotation CSV, genotype store, genotype PCs).
# NOTE(review): this is an absolute, cluster-specific path. A repo-relative
# default (three levels above the cellink package, in "data") used to be
# assigned first and was immediately overwritten by this line, so it never took
# effect; the dead assignment has been removed.
DATA = Path("/s/project/sys_gen_students/2024_2025/project04_rare_variant_sc/input_data")
# Genotype files are currently resolved directly under DATA; the commented-out
# suffix is the sub-directory that was used previously.
GENODATA = DATA  # / "OneK1K_imputation_post_qc_r2_08"

# Genotype principal components (read later with pandas as a headerless,
# whitespace-separated table).
gpc_path = GENODATA / "pcdir/wgs.dose.filtered.R2_0.8.filtered.pruned.eigenvec"
# Single-cell expression matrix (read with anndata).
adata_path = DATA / "OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad.gz"
# Chromosome 22 genotype store (read with cellink's sgkit/zarr reader).
gdata_path = GENODATA / "filter_vcf_r08/chr22.dose.filtered.R2_0.8.vcz"

In [29]:
# --- Covariates ---------------------------------------------------------------
n_gpcs = 20  # leading genotype PCs kept as covariates
n_epcs = 15  # expression PCs computed from the donor-level mean expression
batch_e_pcs_n_top_genes = 2000  # highly variable genes selected before that PCA

# --- Genomic region -----------------------------------------------------------
chrom = 22
cis_window = 100_000  # flank subtracted from gene start / added to gene end

# --- Cell selection and pseudobulk --------------------------------------------
celltype_key = "cell_label"  # obs column holding the cell-type label
cell_type = "CD8 NC"
original_donor_col = "individual"  # obs column copied into the donor column
pb_gex_key = f"PB_{cell_type}"  # pseudobulk expression in dd.G.obsm[key_added]
# Despite the name this is compared against a fraction in [0, 1]: the share of
# donors with non-zero pseudobulk expression for a gene.
min_percent_donors_expressed = 0.1

# --- Debugging ----------------------------------------------------------------
# When True, a "debug_" expression file is read and RuntimeWarnings are silenced
# during the burden test.
do_debug = False

## Prepare data 

In [30]:
# Debug runs read the "debug_"-prefixed expression file instead of the full one.
if do_debug:
    adata_path = DATA / "debug_OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad"

adata = ad.read_h5ad(adata_path)
gdata = cl.io.read_sgkit_zarr(gdata_path)

# Gene annotation table, indexed by Ensembl gene id.
gene_ann = pd.read_csv(DATA / "gene_annotation.csv").set_index("ensembl_gene_id")

# Append the annotation columns to adata.var (rows looked up by adata's gene
# index; .loc raises KeyError if a gene is missing from the table) and rename
# the coordinate columns to the names cellink defines in GAnn.
coordinate_column_names = {
    "start_position": GAnn.start,
    "end_position": GAnn.end,
    "chromosome_name": GAnn.chrom,
}
var_with_annotation = pd.concat([adata.var, gene_ann.loc[adata.var.index]], axis=1)
adata.var = var_with_annotation.rename(columns=coordinate_column_names)

# Expose the donor identifier under the column name cellink defines in DAnn.
adata.obs[DAnn.donor] = adata.obs[original_donor_col]
adata

AnnData object with n_obs × n_vars = 1272489 × 32738
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'pool', 'individual', 'percent.mt', 'latent', 'nCount_SCT', 'nFeature_SCT', 'cell_type', 'cell_label', 'sex', 'age', 'donor_id'
    var: 'GeneSymbol', 'features', 'start', 'end', 'chrom', 'strand', 'description', 'wikigene_name', 'wikigene_id'

In [31]:
# Pair the genotype data (G) with the single-cell expression data (C) in one
# DonorData container; all later steps operate on `dd`.
dd = cl.DonorData(G=gdata, C=adata).copy()  # copy to avoid view warnings
dd



### Prepare single-cell data

In [32]:
# Normalise and log-transform the cell-level expression. The return values are
# discarded, so these calls rely on scanpy modifying dd.C in place.
sc.pp.normalize_total(dd.C)
sc.pp.log1p(dd.C)
# NOTE(review): this second normalize_total runs on already log-transformed
# values — confirm it is intended and not a duplicated line.
sc.pp.normalize_total(dd.C)

# Expression PCs come from the per-donor mean expression. dd.C has not been
# restricted to a cell type at this point, so the mean is taken over all of a
# donor's cells.
mdata = sc.get.aggregate(dd.C, by=DAnn.donor, func="mean")
# aggregate() puts the result in a layer; move it to X for the steps below.
mdata.X = mdata.layers.pop("mean")

sc.pp.highly_variable_genes(mdata, n_top_genes=batch_e_pcs_n_top_genes)
sc.tl.pca(mdata, n_comps=n_epcs)

# Reorder the donor-level PCs to the donor order of dd.G before storing them.
dd.G.obsm["ePCs"] = mdata[dd.G.obs_names].obsm["X_pca"]

In [33]:
# Keep only the cells labelled with the configured cell type and copy the
# selection so `dd` is no longer a view.
is_selected_cell_type = dd.C.obs[celltype_key] == cell_type
dd = dd[..., is_selected_cell_type, :].copy()
dd



In [34]:
# Run a garbage-collection pass now that `dd` has been rebound to the
# cell-type subset (presumably to release the pre-subset data — TODO confirm).
gc.collect()

1665

In [35]:
# Pseudobulk: aggregate the cell-level expression per donor; the result is read
# back from dd.G.obsm[pb_gex_key] in the following cells.
dd.aggregate(key_added=pb_gex_key, sync_var=True, verbose=True)
# Bring "sex" and "age" to donor level by taking the first value per donor;
# they are read from dd.G.obs when the covariate matrix is built.
dd.aggregate(obs=["sex", "age"], func="first", add_to_obs=True)
dd

[2025-04-22 12:25:24,485] INFO:cellink._core.donordata: Aggregated X to PB_CD8 NC
[2025-04-22 12:25:24,486] INFO:cellink._core.donordata: Observation found for 981 donors.




In [36]:
# Sanity check: dimensions of the pseudobulk matrix before gene filtering.
dd.G.obsm[pb_gex_key].shape

(981, 32738)

In [37]:
# The eigenvec file is whitespace-delimited without a header row. Column 1
# becomes the index and column 0 is discarded.
gpcs = pd.read_csv(gpc_path, sep=r"\s+", header=None, index_col=1)
gpcs = gpcs.drop(columns=0)

# Align the rows to the donor order of dd.G and keep the leading n_gpcs columns.
gpcs_in_donor_order = gpcs.loc[dd.G.obs_names]
dd.G.obsm["gPCs"] = gpcs_in_donor_order.iloc[:, :n_gpcs]

In [38]:
# Report the sizes before filtering.
pseudobulk_expr = dd.G.obsm[pb_gex_key]
print(f"{pb_gex_key} shape:", pseudobulk_expr.shape)
print("dd.shape:", dd.shape)

# Per gene: share of donors whose pseudobulk expression is non-zero. Genes
# below the configured threshold are dropped.
frac_donors_expressing = (pseudobulk_expr > 0).mean(axis=0)
keep_genes = (frac_donors_expressing >= min_percent_donors_expressed).values
dd = dd[..., keep_genes]

# Report the sizes again; only the gene dimension should have changed.
print("after filtering")
print(f"{pb_gex_key} shape:", dd.G.obsm[pb_gex_key].shape)
print("dd.shape:", dd.shape)

PB_CD8 NC shape: (981, 32738)
dd.shape: (981, 143083, 133482, 32738)
after filtering
PB_CD8 NC shape: (981, 14119)
dd.shape: (981, 143083, 133482, 14119)


In [39]:
# Covariate matrix F, one row per donor. Column layout:
#   0      intercept (all ones)
#   1      sex, shifted down by one
#   2      age
#   3...   genotype PCs followed by expression PCs
n_donors = dd.shape[0]
covariate_blocks = [
    np.ones((n_donors, 1)),
    dd.G.obs[["sex"]].values - 1,
    dd.G.obs[["age"]].values,
    dd.G.obsm["gPCs"].values,
    dd.G.obsm["ePCs"],
]
F = np.concatenate(covariate_blocks, axis=1)

# Intercept and sex are left as they are; age and all PCs are column-normalised.
F[:, 2:] = column_normalize(F[:, 2:])

In [40]:
# Display the chromosome this run is restricted to.
chrom

22

In [41]:
# Restrict both the variants (G) and the genes (C) to the selected chromosome.
# The chrom columns are compared against the string form of `chrom`.
# alternative to dd[:, dd.G.var.chrom == str(chrom), :, dd.C.var.chrom == str(chrom)]
chrom_label = str(chrom)
variants_on_chrom = dd.G.var.chrom == chrom_label
genes_on_chrom = dd.C.var.chrom == chrom_label
dd = dd.sel(G_var=variants_on_chrom, C_var=genes_on_chrom).copy()
dd



### Adding variant annotations to `dd`

In [42]:
# VEP annotation table for the chromosome 22 variants.
# NOTE(review): absolute path into a user's home directory — consider resolving
# it relative to a configurable data directory so others can run the notebook.
vep_annotation_file = "/data/nasif12/home_if12/hoev/git/sc-genetics/tests/data/variants_vep_annotated_all_ch22.txt"

In [43]:
# Read the VEP annotation file and attach it to the genotype data; afterwards
# the table is available under dd.G.uns["variant_annotation_vep"].
# NOTE(review): dummy_consequence=True presumably expands "Consequence" into
# the 0/1 "Consequence_*" indicator columns — confirm against the cellink docs.
cl.tl.add_vep_annos_to_gdata(vep_anno_file=vep_annotation_file, gdata=dd.G, dummy_consequence=True)
dd.G.uns["variant_annotation_vep"]

[2025-04-22 12:25:26,514] INFO:cellink.tl._annotate_snps_genotype_data: Preparing VEP annotations for addition to gdata
[2025-04-22 12:25:26,514] INFO:cellink.tl._annotate_snps_genotype_data: Reading annotation file /data/nasif12/home_if12/hoev/git/sc-genetics/tests/data/variants_vep_annotated_all_ch22.txt


[2025-04-22 12:25:26,932] INFO:cellink.tl._annotate_snps_genotype_data: Annotation file loaded
[2025-04-22 12:25:26,958] INFO:cellink.tl._annotate_snps_genotype_data: Annotation columns: ['snp_id', 'Location', 'Allele', 'gene_id', 'transcript_id', 'Feature_type', 'Consequence', 'cDNA_position', 'CDS_position', 'Protein_position', 'Amino_acids', 'Codons', 'Existing_variation', 'IMPACT', 'DISTANCE', 'STRAND', 'FLAGS', 'BIOTYPE', 'CANONICAL', 'ENSP', 'SIFT', 'PolyPhen', 'gnomADe_AF', 'gnomADe_AFR_AF', 'gnomADe_AMR_AF', 'gnomADe_ASJ_AF', 'gnomADe_EAS_AF', 'gnomADe_FIN_AF', 'gnomADe_NFE_AF', 'gnomADe_OTH_AF', 'gnomADe_SAS_AF', 'CLIN_SIG', 'SOMATIC', 'PHENO', 'CADD_PHRED', 'CADD_RAW', 'TSSDistance']
[2025-04-22 12:25:29,054] INFO:cellink.tl._annotate_snps_genotype_data: Changing dtype of categorical columns ['snp_id', 'gene_id', 'transcript_id', 'STRAND', 'gnomADe_ASJ_AF', 'BIOTYPE', 'gnomADe_NFE_AF', 'Protein_position', 'gnomADe_OTH_AF', 'IMPACT', 'CDS_position', 'Existing_variation', 'PHEN

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Consequence_3_prime_UTR_variant,Consequence_5_prime_UTR_variant,Consequence_NMD_transcript_variant,Consequence_coding_sequence_variant,Consequence_downstream_gene_variant,Consequence_incomplete_terminal_codon_variant,Consequence_intergenic_variant,Consequence_intron_variant,Consequence_mature_miRNA_variant,Consequence_missense_variant,...,Codons,CADD_PHRED,Amino_acids,SOMATIC,gnomADe_AMR_AF,DISTANCE,gnomADe_FIN_AF,FLAGS,gnomADe_SAS_AF,gnomADe_AFR_AF
snp_id,gene_id,transcript_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
22_16849573_A_G,-,-,0,0,0,0,0,0,1,0,0,0,...,-,8.747,-,-,,,,-,,
22_16849971_A_T,-,-,0,0,0,0,0,0,1,0,0,0,...,-,8.843,-,-,,,,-,,
22_16850437_G_A,-,-,0,0,0,0,0,0,1,0,0,0,...,-,8.063,-,-,,,,-,,
22_16851225_C_T,-,-,0,0,0,0,0,0,1,0,0,0,...,-,8.324,-,-,,,,-,,
22_16851356_C_T,-,-,0,0,0,0,0,0,1,0,0,0,...,-,8.148,-,-,,,,-,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22_51211031_A_G,ENSG00000184319,ENST00000496652,0,0,0,0,0,0,0,1,0,0,...,-,5.127,-,-,,,,-,,
22_51213613_C_T,ENSG00000184319,ENST00000496652,0,0,0,0,0,0,0,1,0,0,...,-,0.190,-,-,,,,-,,
22_51213613_C_T,ENSG00000079974,ENST00000395593,0,0,0,0,0,0,0,1,0,0,...,-,0.190,-,-,,,,-,,
22_51216564_T_C,ENSG00000184319,ENST00000496652,0,0,0,0,0,0,0,1,0,0,...,-,1.282,-,-,,,,-,,


In [44]:
# Aggregate the VEP annotation table stored under "variant_annotation_vep"
# using the "first" method and display the returned table. The next cell reads
# the per-variant result from dd.G.varm["variant_annotation"].
cl.tl.aggregate_annotations_for_varm(
    dd.G, "variant_annotation_vep", agg_type="first", return_data=True
)  # TODO change agg type

[2025-04-22 12:25:30,086] INFO:cellink.tl._annotate_snps_genotype_data: Aggregating using method: first


Unnamed: 0_level_0,gene_id,transcript_id,Consequence_3_prime_UTR_variant,Consequence_5_prime_UTR_variant,Consequence_NMD_transcript_variant,Consequence_coding_sequence_variant,Consequence_downstream_gene_variant,Consequence_incomplete_terminal_codon_variant,Consequence_intergenic_variant,Consequence_intron_variant,...,Codons,CADD_PHRED,Amino_acids,SOMATIC,gnomADe_AMR_AF,DISTANCE,gnomADe_FIN_AF,FLAGS,gnomADe_SAS_AF,gnomADe_AFR_AF
snp_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22_16849573_A_G,-,-,0,0,0,0,0,0,1,0,...,-,8.747,-,-,,,,-,,
22_16849971_A_T,-,-,0,0,0,0,0,0,1,0,...,-,8.843,-,-,,,,-,,
22_16850437_G_A,-,-,0,0,0,0,0,0,1,0,...,-,8.063,-,-,,,,-,,
22_16851225_C_T,-,-,0,0,0,0,0,0,1,0,...,-,8.324,-,-,,,,-,,
22_16851356_C_T,-,-,0,0,0,0,0,0,1,0,...,-,8.148,-,-,,,,-,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22_51202748_A_G,ENSG00000184319,ENST00000496652,0,0,0,0,0,0,0,1,...,-,9.870,-,-,,,,-,,
22_51208568_G_T,ENSG00000184319,ENST00000496652,0,0,0,0,0,0,0,1,...,-,4.299,-,-,,,,-,,
22_51211031_A_G,ENSG00000079974,ENST00000395593,0,0,0,0,0,0,0,1,...,-,5.127,-,-,,,,-,,
22_51213613_C_T,ENSG00000184319,ENST00000496652,0,0,0,0,0,0,0,1,...,-,0.190,-,-,,,,-,,


In [45]:
# List the annotation columns now available per variant in varm.
dd.G.varm["variant_annotation"].columns

Index(['gene_id', 'transcript_id', 'Consequence_3_prime_UTR_variant',
       'Consequence_5_prime_UTR_variant', 'Consequence_NMD_transcript_variant',
       'Consequence_coding_sequence_variant',
       'Consequence_downstream_gene_variant',
       'Consequence_incomplete_terminal_codon_variant',
       'Consequence_intergenic_variant', 'Consequence_intron_variant',
       'Consequence_mature_miRNA_variant', 'Consequence_missense_variant',
       'Consequence_non_coding_transcript_exon_variant',
       'Consequence_non_coding_transcript_variant',
       'Consequence_splice_acceptor_variant',
       'Consequence_splice_donor_5th_base_variant',
       'Consequence_splice_donor_region_variant',
       'Consequence_splice_donor_variant',
       'Consequence_splice_polypyrimidine_tract_variant',
       'Consequence_splice_region_variant', 'Consequence_start_lost',
       'Consequence_stop_gained', 'Consequence_stop_retained_variant',
       'Consequence_synonymous_variant', 'Consequence_ups

In [46]:
# Summary statistics of CADD_RAW, one of the two annotations used by the
# burden test.
dd.G.uns["variant_annotation_vep"]["CADD_RAW"].describe()

count    178597.000000
mean         -0.000616
std           0.407068
min          -2.286863
25%          -0.202241
50%          -0.040239
75%           0.123338
max           7.595543
Name: CADD_RAW, dtype: float64

In [47]:
# Summary statistics of TSSDistance, the other annotation used by the burden
# test.
# NOTE(review): in the recorded run TSSDistance is non-null for far fewer rows
# than CADD_RAW — check whether that explains NaN TSSDistance burden results.
dd.G.uns["variant_annotation_vep"]["TSSDistance"].describe()

count    24466.000000
mean      2536.080724
std       1454.770799
min          1.000000
25%       1272.000000
50%       2561.000000
75%       3803.000000
max       5000.000000
Name: TSSDistance, dtype: float64

## Run the burden test

In [48]:
# Variant annotation columns passed to the burden test.
annotation_cols = ["CADD_RAW", "TSSDistance"]
# Aggregation function name forwarded to burden_test.
burden_agg_fct = "sum"
# Forwarded to burden_test as run_lrt.
run_lrt = True

In [49]:
results = []

# If the genotype matrix is still a dask array (or a dask-backed view), load it
# into memory once up front so the per-gene slicing below works on in-memory
# data.
if isinstance(dd.G.X, da.Array | ad._core.views.DaskArrayView):
    if dd.G.is_view:
        dd._G = dd._G.copy()  # TODO: discuss with SWEs
    dd.G.X = dd.G.X.compute()

if do_debug:
    warnings.filterwarnings("ignore", category=RuntimeWarning)

# A small Gaussian jitter is added to the expression before gaussianize()
# (presumably to break ties between identical values — TODO confirm). It is
# drawn from a seeded generator so that re-running the notebook reproduces the
# same results.
random_seed = 0
rng = np.random.default_rng(random_seed)

# Loop-invariant lookups, hoisted out of the per-gene loop.
n_donors = dd.shape[0]
variant_pos = dd.G.var.pos

for gene, row in tqdm(dd.C.var.iterrows(), total=dd.shape[3]):
    jitter = 1e-5 * rng.standard_normal((n_donors, 1))
    Y = gaussianize(dd.G.obsm[pb_gex_key][[gene]].values + jitter)

    # cis window: gene body extended by cis_window on both sides, clipped at 0.
    # Both bounds are exclusive.
    start = max(0, row.start - cis_window)
    end = row.end + cis_window
    in_window = (variant_pos > start) & (variant_pos < end)
    _G = dd.G[:, in_window]
    # Drop variants with zero standard deviation across donors.
    _G = _G[:, (_G.X.std(0) != 0)]

    gene_rdf = burden_test(
        _G, Y, F, gene, annotation_cols=annotation_cols, burden_agg_fct=burden_agg_fct, run_lrt=run_lrt
    )
    results.append(gene_rdf)

# One table with the rows of every per-gene result (original row labels kept).
rdf = pd.concat(results)
rdf

100%|██████████| 360/360 [00:11<00:00, 30.13it/s]


Unnamed: 0,burden_gene,egene,burden_type,burden_agg_fct,pv,beta,betaste,lrt
0,ENSG00000100181,ENSG00000100181,CADD_RAW,sum,0.522758,-0.008644,0.013525,0.408449
1,ENSG00000100181,ENSG00000100181,TSSDistance,sum,,,,
0,ENSG00000237438,ENSG00000237438,CADD_RAW,sum,0.711724,-0.001945,0.005265,0.136561
1,ENSG00000237438,ENSG00000237438,TSSDistance,sum,,,,
0,ENSG00000177663,ENSG00000177663,CADD_RAW,sum,0.150213,0.006980,0.004851,2.070087
...,...,...,...,...,...,...,...,...
1,ENSG00000205559,ENSG00000205559,TSSDistance,sum,,,,
0,ENSG00000100299,ENSG00000100299,CADD_RAW,sum,0.106072,-0.006906,0.004273,2.611786
1,ENSG00000100299,ENSG00000100299,TSSDistance,sum,,,,
0,ENSG00000079974,ENSG00000079974,CADD_RAW,sum,0.163640,-0.013136,0.009431,1.940259
