# Pseudobulk eQTL Analysis

In [1]:
import gc
import warnings
from pathlib import Path

import anndata as ad

with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=FutureWarning)
    import scanpy as sc

import dask.array as da
import numpy as np
import pandas as pd
from statsmodels.stats.multitest import fdrcorrection
from tqdm.auto import tqdm

import cellink as cl
from cellink._core import DAnn, GAnn
from cellink.tl.gwas import GWAS
from cellink.utils import column_normalize, gaussianize

In [2]:
DATA = Path(cl.__file__).parent.parent.parent / "data"

gpc_path = DATA / "OneK1K_imputation_post_qc_r2_08/pcdir/wgs.dose.filtered.R2_0.8.filtered.pruned.eigenvec"
adata_path = DATA / "OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad.gz"
gdata_path = DATA / "OneK1K_imputation_post_qc_r2_08/filter_zarr_r08/chr22.dose.filtered.R2_0.8.vcz"

In [3]:
n_gpcs = 20
n_epcs = 15
batch_e_pcs_n_top_genes = 2000
chrom = 22
cis_window = 500_000
cell_type = "CD4 ET"
pb_gex_key = f"PB_{cell_type}"  # pseudobulk expression in dd.G.obsm[key_added]
original_donor_col = "individual"
min_percent_donors_expressed = 0.1
celltype_key = "cell_label"
do_debug = True

In [4]:
if do_debug:
    adata_path = DATA / "debug_OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad"

adata = ad.read_h5ad(adata_path)
gdata = cl.io.read_sgkit_zarr(gdata_path)

gene_ann = pd.read_csv(DATA / "gene_annotation.csv").set_index("ensembl_gene_id")
adata.var = pd.concat([adata.var, gene_ann.loc[adata.var.index]], axis=1).rename(
    columns={
        "start_position": GAnn.start,
        "end_position": GAnn.end,
        "chromosome_name": GAnn.chrom,
    }
)
adata.obs[DAnn.donor] = adata.obs[original_donor_col]
adata

AnnData object with n_obs × n_vars = 25908 × 32738
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'pool', 'individual', 'percent.mt', 'latent', 'nCount_SCT', 'nFeature_SCT', 'cell_type', 'cell_label', 'sex', 'age', 'donor_id'
    var: 'GeneSymbol', 'features', 'start', 'end', 'chrom', 'strand', 'description', 'wikigene_name', 'wikigene_id'

In [5]:
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
sc.pp.normalize_total(adata)

mdata = sc.get.aggregate(adata, by=DAnn.donor, func="mean")
mdata.X = mdata.layers.pop("mean")

sc.pp.highly_variable_genes(mdata, n_top_genes=batch_e_pcs_n_top_genes)
sc.tl.pca(mdata, n_comps=n_epcs)

In [6]:
mdata

AnnData object with n_obs × n_vars = 20 × 32738
    obs: 'donor_id'
    var: 'GeneSymbol', 'features', 'start', 'end', 'chrom', 'strand', 'description', 'wikigene_name', 'wikigene_id', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'hvg', 'pca'
    obsm: 'X_pca'
    varm: 'PCs'

In [7]:
adata = adata[adata.obs[celltype_key] == cell_type].copy()
gc.collect()

338

In [8]:
dd = cl.DonorData(G=gdata, C=adata).copy()
dd



In [9]:
dd.aggregate(key_added=pb_gex_key, filter_key=celltype_key, filter_value=cell_type, sync_var=True)
dd.aggregate(obs=["sex", "age"], func="first", add_to_obs=True)
dd



In [10]:
dd.G.obsm["PB_CD4 ET"].shape

(20, 32738)

In [11]:
gpcs = pd.read_csv(gpc_path, sep=r"\s+", index_col=1, header=None).drop(columns=0)
dd.G.obsm["gPCs"] = gpcs.loc[dd.G.obs.index].iloc[:, :n_gpcs]

In [12]:
print(dd.G.obsm[pb_gex_key].shape)
print(dd.shape)

keep_genes = ((dd.G.obsm[pb_gex_key] > 0).mean(axis=0) >= min_percent_donors_expressed).values
dd = dd[..., keep_genes]
# dd[C_obs=keep_donors, :, C_obs=keep_cells, C_var=keep_genes]

print(dd.G.obsm[pb_gex_key].shape)
print(dd.shape)

(20, 32738)
(20, 143083, 1332, 32738)


IndexError: Boolean index does not match AnnData’s shape along this dimension. Boolean index has shape (32738,) while AnnData index has shape (143083,).

In [15]:
dd.G.obsm["ePCs"] = mdata[dd.G.obs_names].obsm["X_pca"]

In [16]:
F = np.concatenate(
    [
        np.ones((dd.shape[0], 1)),
        dd.G.obs[["sex"]].values - 1,
        dd.G.obs[["age"]].values,
        dd.G.obsm["gPCs"].values,
        dd.G.obsm["ePCs"],
    ],
    axis=1,
)
F[:, 2:] = column_normalize(F[:, 2:])

In [17]:
dd = dd[:, dd.G.var.chrom == str(chrom), :, dd.C.var.chrom == str(chrom)]
dd

  _D.obsm[key] = _D.obsm[key].loc[:, _C.var.index]




In [18]:
results = []
if isinstance(dd.G.X, da.Array | ad._core.views.DaskArrayView):
    if dd.G.is_view:
        dd._G = dd._G.copy()  # TODO: discuss with SWEs
    dd.G.X = dd.G.X.compute()


for gene, row in tqdm(dd.C.var.iterrows(), total=dd.shape[3]):
    Y = gaussianize(dd.G.obsm[pb_gex_key][[gene]].values + 1e-5 * np.random.randn(dd.shape[0], 1))

    start = max(0, row.start - cis_window)
    end = row.end + cis_window
    _G = dd.G[:, (dd.G.var.pos < end)]
    _G = _G[:, (_G.var.pos > start)]
    _G = _G[:, (_G.X.std(0) != 0)]
    G = _G.X

    gwas = GWAS(Y, F)
    gwas.process(G)

    snp_idx = gwas.getPv().argmin()

    def _get_top_snp(arr, snp_idx=snp_idx):
        return arr.ravel()[snp_idx].item()

    rdict = {
        "snp": _G.var.iloc[snp_idx].name,
        "egene": gene,
        "n_cis_snps": G.shape[1],
        "pv": _get_top_snp(gwas.getPv()),
        "beta": _get_top_snp(gwas.getBetaSNP()),
        "betaste": _get_top_snp(gwas.getBetaSNPste()),
        "lrt": _get_top_snp(gwas.getLRT()),
    }
    results.append(rdict)

rdf = pd.DataFrame(results)
rdf

  0%|          | 0/320 [00:00<?, ?it/s]

Unnamed: 0,snp,egene,n_cis_snps,pv,beta,betaste,lrt
0,22_17274624_G_A,ENSG00000100181,1688,5.791056e-04,0.641658,0.186462,11.841969
1,22_17435280_G_A,ENSG00000177663,3188,2.904333e-04,0.462711,0.127691,13.131106
2,22_18104924_A_G,ENSG00000069998,3378,1.294364e-03,3.192388,0.992279,10.350546
3,22_17860359_C_T,ENSG00000093072,3567,1.397507e-03,0.498323,0.155962,10.209082
4,22_18077593_G_A,ENSG00000131100,4579,1.063775e-03,-3.243186,0.990863,10.713138
...,...,...,...,...,...,...,...
315,22_50633993_G_A,ENSG00000205560,3415,1.246329e-03,-1.108288,0.343330,10.420370
316,22_50957733_C_T,ENSG00000100288,3337,1.087430e-03,0.657518,0.201268,10.672450
317,22_50978217_C_T,ENSG00000205559,3317,8.843337e-04,0.770082,0.231607,11.055314
318,22_51064416_T_C,ENSG00000100299,3068,8.601285e-08,-0.374713,0.069987,28.665721


In [19]:
rdf["pv_adj"] = np.clip(rdf["pv"] * rdf["n_cis_snps"], 0, 1)  # gene-wise Bonferroni
rdf["qv"] = fdrcorrection(rdf["pv_adj"])[1]

(rdf.qv < 0.05).sum()

24