In [1]:
import gc
from pathlib import Path

import anndata as ad
import dask.array as da
import numpy as np
import pandas as pd
import scanpy as sc
from statsmodels.stats.multitest import fdrcorrection
from tqdm.auto import tqdm
from utils import GWAS, column_normalize, gaussianize

import cellink as cl
from cellink._core import DAnn, GAnn



In [2]:
# The "data" directory sits in the third ancestor directory of the cellink package file.
DATA = Path(cl.__file__).parents[2] / "data"

# Input files, all resolved relative to DATA:
#   gpc_path   - eigenvec file read later with pandas to obtain the genotype PCs
#   adata_path - gzipped h5ad with the single-cell expression matrix
#   gdata_path - chr22 genotype store opened with cl.io.read_sgkit_zarr
gpc_path = DATA.joinpath("OneK1K_imputation_post_qc_r2_08/pcdir/wgs.dose.filtered.R2_0.8.filtered.pruned.eigenvec")
adata_path = DATA.joinpath("OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad.gz")
gdata_path = DATA.joinpath("OneK1K_imputation_post_qc_r2_08/filter_zarr_r08/chr22.dose.filtered.R2_0.8.vcz")

In [3]:
# --- Covariates -------------------------------------------------------------
n_gpcs = 20  # leading genotype-PC columns kept from the eigenvec file
n_epcs = 15  # n_comps passed to sc.tl.pca on the donor-mean expression
batch_e_pcs_n_top_genes = 2_000  # n_top_genes passed to sc.pp.highly_variable_genes

# --- Association scan -------------------------------------------------------
chrom = 22  # compared (as a string) against the `chrom` annotation of SNPs and genes
cis_window = 500_000  # added to gene end / subtracted from gene start when selecting SNPs
min_percent_donors_expressed = 0.1  # used as a fraction (0-1) of donors, despite the name

# --- Cell / donor annotation ------------------------------------------------
celltype_key = "cell_label"
cell_type = "CD4 ET"
original_donor_col = "individual"
pb_gex_key = "PB_" + cell_type  # pseudobulk expression in dd.D.obsm[key_added]

# --- Misc -------------------------------------------------------------------
do_debug = False  # when True, the loading cell reads the "debug_" expression file instead

In [4]:
# In debug mode, read the "debug_" copy of the expression matrix instead
# (presumably a smaller subset — confirm).
if do_debug:
    adata_path = DATA / "debug_OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad"

# Single-cell expression (AnnData) and genotypes (read through cellink's sgkit/zarr reader).
adata = ad.read_h5ad(adata_path)
gdata = cl.io.read_sgkit_zarr(gdata_path)

# Append the gene annotation columns to adata.var, aligned on the gene index.
# `.loc` raises KeyError if any gene in adata.var is missing from the annotation file.
gene_ann = pd.read_csv(DATA / "gene_annotation.csv").set_index("ensembl_gene_id")
# Rename the positional columns to the names held in GAnn (start / end / chrom).
adata.var = pd.concat([adata.var, gene_ann.loc[adata.var.index]], axis=1).rename(
    columns={
        "start_position": GAnn.start,
        "end_position": GAnn.end,
        "chromosome_name": GAnn.chrom,
    }
)
# Duplicate the dataset's donor column under the column name held in DAnn.donor.
adata.obs[DAnn.donor] = adata.obs[original_donor_col]
adata

AnnData object with n_obs × n_vars = 1272489 × 32738
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'pool', 'individual', 'percent.mt', 'latent', 'nCount_SCT', 'nFeature_SCT', 'cell_type', 'cell_label', 'sex', 'age', 'donor_id'
    var: 'GeneSymbol', 'features', 'start', 'end', 'chrom', 'strand', 'description', 'wikigene_name', 'wikigene_id'

In [5]:
# Normalise and log-transform the single-cell matrix in place.
# NOTE(review): normalize_total is applied a second time after log1p — confirm this is intentional.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
sc.pp.normalize_total(adata)

# Per-donor mean expression. This runs before the later cell that subsets `adata`
# to `cell_type`, so cells of every type contribute to the means.
mdata = sc.get.aggregate(adata, by=DAnn.donor, func="mean")
# Move the aggregated "mean" layer into X, which the calls below operate on.
mdata.X = mdata.layers.pop("mean")

# Flag highly variable genes, then compute n_epcs principal components; the
# scores end up in mdata.obsm["X_pca"] and are attached to the donors later.
# NOTE(review): whether the PCA is restricted to the flagged genes depends on scanpy's defaults — verify.
sc.pp.highly_variable_genes(mdata, n_top_genes=batch_e_pcs_n_top_genes)
sc.tl.pca(mdata, n_comps=n_epcs)

In [6]:
# Display the donor-level AnnData to check that the HVG flags and PCA results were added.
mdata

AnnData object with n_obs × n_vars = 981 × 32738
    obs: 'donor_id'
    var: 'GeneSymbol', 'features', 'start', 'end', 'chrom', 'strand', 'description', 'wikigene_name', 'wikigene_id', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

In [7]:
# Keep only the cells labelled with the cell type under study; .copy() turns the
# subset into a standalone object rather than a view of the full data.
adata = adata[adata.obs[celltype_key] == cell_type].copy()
# Run a garbage-collection pass after rebinding `adata`
# (presumably to release the full matrix — confirm).
gc.collect()

439

In [8]:
# Combine the cell-level expression (C) and the genotypes (D) in one DonorData
# container; later cells index it along four axes (see dd.shape).
dd = cl.DonorData(C=adata, D=gdata).copy()
dd



In [9]:
# Aggregate the cells matching `cell_type` into a per-donor expression table;
# later cells read it back from dd.D.obsm[pb_gex_key].
dd.aggregate(key_added=pb_gex_key, filter_key=celltype_key, filter_value=cell_type, sync_var=True)
# Bring sex and age to donor level (func="first") and add them to the donor obs,
# where the covariate matrix reads them from (dd.D.obs).
dd.aggregate(obs=["sex", "age"], func="first", add_to_obs=True)
dd



In [10]:
# The eigenvec file is whitespace-separated with no header row: column 1 becomes
# the index and column 0 is discarded.
gpcs = pd.read_csv(gpc_path, header=None, index_col=1, sep=r"\s+").drop(columns=[0])
# Keep the leading n_gpcs columns, with rows ordered like the donors in dd.D.
dd.D.obsm["gPCs"] = gpcs.iloc[:, :n_gpcs].loc[dd.D.obs.index]

In [11]:
# Shapes before filtering.
print(dd.D.obsm[pb_gex_key].shape)
print(dd.shape)

# Keep a gene when its pseudobulk value is > 0 in at least
# `min_percent_donors_expressed` (a fraction) of the donors.
keep_genes = dd.D.obsm[pb_gex_key].gt(0).mean(axis=0).ge(min_percent_donors_expressed).to_numpy()
# The gene axis is the fourth axis of DonorData.
dd = dd[:, :, :, keep_genes]

# Shapes after filtering.
print(dd.D.obsm[pb_gex_key].shape)
print(dd.shape)

(981, 32738)
(981, 143083, 61786, 32738)
(981, 12582)
(981, 143083, 61786, 12582)


  _D.obsm[key] = _D.obsm[key].loc[:, _C.var.index]


In [12]:
# Attach the expression PCs computed on `mdata`, selecting its rows by the donor names of dd.D.
dd.D.obsm["ePCs"] = mdata[dd.D.obs_names].obsm["X_pca"]

In [13]:
# Covariate matrix with one row per donor; columns are stacked left to right.
F = np.hstack(
    (
        np.ones((dd.shape[0], 1)),  # column 0: intercept
        dd.D.obs[["sex"]].to_numpy() - 1,  # column 1: sex shifted down by one (assumes 1/2 coding — confirm)
        dd.D.obs[["age"]].to_numpy(),  # column 2: age
        dd.D.obsm["gPCs"].to_numpy(),  # genotype PCs
        dd.D.obsm["ePCs"],  # expression PCs
    )
)
# Intercept and sex stay as they are; every column from age onward goes through column_normalize.
F[:, 2:] = column_normalize(F[:, 2:])

In [14]:
# Restrict both the SNPs (dd.D.var) and the genes (dd.C.var) to the selected
# chromosome; the `chrom` annotations are compared against the string form of `chrom`.
dd = dd[:, dd.D.var.chrom == str(chrom), :, dd.C.var.chrom == str(chrom)]
dd

  _D.obsm[key] = _D.obsm[key].loc[:, _C.var.index]




In [15]:
results = []
# If the genotype matrix is dask-backed, load it into memory once before the loop.
if isinstance(dd.D.X, da.Array | ad._core.views.DaskArrayView):
    if dd.D.is_view:
        # Replace the view with an actual copy before assigning a new X
        # (presumably required because X of a view should not be reassigned — confirm).
        dd._D = dd._D.copy()
    dd.D.X = dd.D.X.compute()

# Seeded generator: the jitter added to the phenotype below is random, so an
# explicit seed is needed for the results to be identical across runs.
rng = np.random.default_rng(0)

# SNP positions do not change inside the loop; look them up once.
snp_pos = dd.D.var.pos


def _get_top_snp(arr, snp_idx):
    """Return the scalar stored at flat position ``snp_idx`` of ``arr``."""
    return arr.ravel()[snp_idx].item()


# One association scan per gene; only the best (lowest p-value) cis SNP is recorded.
for gene, row in tqdm(dd.C.var.iterrows(), total=dd.shape[3]):
    # Pseudobulk expression of this gene plus a tiny random jitter, then gaussianized
    # (the jitter presumably breaks ties between identical values — confirm).
    jitter = 1e-5 * rng.standard_normal((dd.shape[0], 1))
    Y = gaussianize(dd.D.obsm[pb_gex_key][[gene]].values + jitter)

    # Cis window: strictly between (gene start - cis_window, floored at 0) and (gene end + cis_window).
    start = max(0, row.start - cis_window)
    end = row.end + cis_window
    _D = dd.D[:, (snp_pos > start) & (snp_pos < end)]
    # Drop SNPs with no variation across donors.
    _D = _D[:, (_D.X.std(0) != 0)]
    G = _D.X

    # Nothing to test for this gene: skip it instead of failing on argmin of an
    # empty array. Skipped genes do not appear in `rdf`.
    if G.shape[1] == 0:
        continue

    gwas = GWAS(Y, F)
    gwas.process(G)

    # Fetch the p-values once and locate the top SNP.
    pv = gwas.getPv()
    snp_idx = pv.argmin()

    rdict = {
        "snp": _D.var.index[snp_idx],
        "egene": gene,
        "n_cis_snps": G.shape[1],
        "pv": _get_top_snp(pv, snp_idx),
        "beta": _get_top_snp(gwas.getBetaSNP(), snp_idx),
        "betaste": _get_top_snp(gwas.getBetaSNPste(), snp_idx),
        "lrt": _get_top_snp(gwas.getLRT(), snp_idx),
    }
    results.append(rdict)

# One row per tested gene.
rdf = pd.DataFrame(results)
rdf

  0%|          | 0/320 [00:00<?, ?it/s]

Unnamed: 0,snp,egene,n_cis_snps,pv,beta,betaste,lrt
0,22_17274624_G_A,ENSG00000100181,1688,5.248067e-04,0.639690,0.184468,12.025392
1,22_17435280_G_A,ENSG00000177663,3188,2.208257e-04,0.469891,0.127207,13.645024
2,22_18104924_A_G,ENSG00000069998,3378,1.307317e-03,3.191381,0.992848,10.332166
3,22_17860359_C_T,ENSG00000093072,3567,1.648999e-03,0.490301,0.155795,9.904224
4,22_18161979_G_A,ENSG00000131100,4579,1.441427e-03,0.197257,0.061909,10.152027
...,...,...,...,...,...,...,...
315,22_50896385_A_G,ENSG00000205560,3415,1.367158e-03,0.151194,0.047226,10.249578
316,22_50957733_C_T,ENSG00000100288,3337,1.897226e-03,0.625450,0.201377,9.646404
317,22_51066990_C_T,ENSG00000205559,3317,1.795486e-05,0.232390,0.054184,18.394786
318,22_51064416_T_C,ENSG00000100299,3068,4.209389e-07,-0.355121,0.070193,25.595872


In [16]:
# Within each gene: Bonferroni over the number of cis SNPs tested, capped to [0, 1].
rdf["pv_adj"] = (rdf["pv"] * rdf["n_cis_snps"]).clip(lower=0, upper=1)  # gene-wise Bonferroni
# Across genes: FDR-adjusted values are the second element returned by fdrcorrection.
rdf["qv"] = fdrcorrection(rdf["pv_adj"])[1]

# Number of genes whose q-value is below 0.05.
rdf["qv"].lt(0.05).sum()

23