In [48]:

import logging
from gwas import GWAS
from time import time
from tqdm import tqdm

import anndata as ad
import scipy as sp
import scipy.linalg as la
import scipy.stats as st
import scanpy as sc
import sgkit as sg
import pandas as pd
import pandas_plink as ps
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from anndata.utils import asarray
from sklearn.preprocessing import StandardScaler
from scipy.sparse import issparse
from pathlib import Path
from cellink.io import read_sgkit_zarr

logger = logging.getLogger(__name__)
num_pcs = 30

In [51]:
## paths
DATA = Path("/home/lollo/Work/hackathon/data/Yazar_OneK1K")
DATA = Path("/Users/jan.engelmann/projects/sc-eqtl/data")

vcf_file_path = DATA / "OneK1K_imputation_post_qc_r2_08/filter_vcf_r08/chr22.dose.filtered.R2_0.8.vcf.gz"

zarr_path = vcf_file_path.parent.parent / "filter_zarr_r08"
zarr_path.mkdir(exist_ok=True)

icf_file_path = zarr_path / vcf_file_path.with_suffix(".icf").name
zarr_file_path = (zarr_path / vcf_file_path.stem).with_suffix(".vcz")
print(zarr_file_path) ## we are processing chromosome 22

scdata_path = DATA / "OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad.gz"

/Users/jan.engelmann/projects/sc-eqtl/data/OneK1K_imputation_post_qc_r2_08/filter_zarr_r08/chr22.dose.filtered.R2_0.8.vcz


In [52]:
## reading single cell data
scdata = ad.read_h5ad(scdata_path)
scdata = scdata[scdata.obs.cell_label == "CD4 NC"]
scdata

View of AnnData object with n_obs × n_vars = 463528 × 32738
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'pool', 'individual', 'percent.mt', 'latent', 'nCount_SCT', 'nFeature_SCT', 'cell_type', 'cell_label', 'sex', 'age'
    var: 'GeneSymbol', 'features'

In [53]:
gdata = read_sgkit_zarr(zarr_file_path)
gdata.obs = gdata.obs.set_index("id")
gdata


AnnData object with n_obs × n_vars = 1034 × 143083
    var: 'chrom', 'pos', 'a0', 'a1', 'AF', 'ER2', 'maf', 'R2', 'contig', 'id', 'id_mask', 'quality'
    varm: 'filter'

In [54]:
## annotating the single cell data
annot = (
    sc.queries.biomart_annotations(
        "hsapiens",
        ["ensembl_gene_id", "start_position", "end_position", "chromosome_name"],
    )
    .set_index("ensembl_gene_id")
    .drop_duplicates()
)

scdata = scdata[:, scdata.var.index.isin(annot.index)]
scdata.var["chrom"] = annot.loc[scdata.var.index, "chromosome_name"].values
scdata.var["start"] = annot.loc[scdata.var.index, "start_position"].values
scdata.var["end"] = annot.loc[scdata.var.index, "end_position"].values

  scdata.var["chrom"] = annot.loc[scdata.var.index, "chromosome_name"].values


In [55]:
sc.pp.normalize_total(scdata)
sc.pp.log1p(scdata)
sc.pp.normalize_total(scdata)

In [56]:
scdata = scdata[:, scdata.var.chrom == "22"]

In [57]:
pbdata = sc.get.aggregate(scdata, "individual", "mean")
gdata = gdata[pbdata.obs.index]
pbdata

AnnData object with n_obs × n_vars = 981 × 666
    obs: 'individual'
    var: 'GeneSymbol', 'features', 'chrom', 'start', 'end'
    layers: 'mean'

In [58]:
assert (pbdata.obs.index == gdata.obs.index).all()

In [61]:
## defining the target gene
target_gene = 'ENSG00000212939' ## this gene is associated with chromosome 22
cis_window = 1_000_000 # 1 mega base
Y = pbdata[:, [target_gene]].layers["mean"]
Y = asarray(Y)

start = pbdata.var.loc[target_gene].start
end = pbdata.var.loc[target_gene].end
chrom = pbdata.var.loc[target_gene].chrom

subgadata = gdata[:, (gdata.var.chrom == chrom) & (gdata.var.pos >= start - cis_window) & (gdata.var.pos <= end + cis_window)]
G = subgadata.X.compute()


gwas = GWAS(Y)
gwas.process(G)
pv = gwas.getPv()
pv[np.isnan(pv)] = 1
pv.min()


  n = 1.0 / (GG - np.einsum("ij,ij->j", FG, A0iFG))
  M = -n * A0iFG
  self.beta_g += n[:, None] * GY


2.200330284969727e-130