In [None]:
import logging
import warnings
import os

import anndata as ad
import scanpy as sc
import numpy as np
import anndata as an
import pandas as pd

from tqdm.notebook import tqdm
from anndata.utils import asarray
from pathlib import Path
from statsmodels.stats.multitest import fdrcorrection

from cellink.io import read_sgkit_zarr
from cellink.tl import get_all_eqtls_on_single_gene

# Install a blanket "ignore" filter: every warning category is silenced from here on.
warnings.simplefilter("ignore")

# Logger for this notebook, named after the executing module.
logger = logging.getLogger(__name__)

In [None]:
# --- run configuration ---
# True selects the small "debug_" expression matrices when the input paths are built.
DEBUG = False
# Chromosome label; genes are kept when their `chrom` annotation equals this string.
TARGET_CHROMOSOME = "22"
# Cut-off used by the (currently commented-out) p-value / q-value post-processing.
THRESHOLD = 0.05
# Window size handed to the eQTL test (presumably in base pairs -- TODO confirm).
CIS_WINDOW = 500_000

In [None]:
## paths
# Root folder holding the OneK1K inputs (absolute, machine-specific location).
DATA = Path("/home/lollo/Work/hackathon/data/Yazar_OneK1K")
#DATA = Path("/Users/jan.engelmann/projects/sc-eqtl/data")

# Filtered dosage VCF for chromosome 22.
vcf_file_path = DATA.joinpath(
    "OneK1K_imputation_post_qc_r2_08/filter_vcf_r08/chr22.dose.filtered.R2_0.8.vcf.gz"
)

# The converted stores live two levels above the VCF file, in a sibling folder.
# Only the leaf folder is created here; its parent must already exist.
zarr_path = vcf_file_path.parents[1] / "filter_zarr_r08"
zarr_path.mkdir(exist_ok=True)

# `.stem` strips just the trailing ".gz", so the ICF name keeps ".vcf" and appends ".icf",
# whereas the VCZ name swaps ".vcf" for ".vcz".
icf_file_path = zarr_path / f"{vcf_file_path.stem}.icf"
zarr_file_path = zarr_path / Path(vcf_file_path.stem).with_suffix(".vcz")

gene_annotation_path = DATA / "gene_annotation.csv"

# Expression matrices: the full cohort files are gzip-suffixed, the debug ones are not.
if not DEBUG:
    scdata_path = DATA / "OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad.gz"
    scdata_annotated_path = DATA / "OneK1K_cohort_gene_expression_matrix_14_celltypes_filtered_annotated.h5ad.gz"
else:
    scdata_path = DATA / "debug_OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad"
    scdata_annotated_path = DATA / "debug_OneK1K_cohort_gene_expression_matrix_14_celltypes_filtered_annotated.h5ad"

# CSV file that receives the eQTL results.
dump_path = "/home/lollo/Work/hackathon/dump/eqtl_results_all_variants.csv"

print(gene_annotation_path, zarr_file_path, scdata_path)

In [None]:
## load the single-cell expression data
scdata = ad.read_h5ad(scdata_path)
## expose the gene identifiers (the var index) as a regular column so that
## the annotation table can later be joined on "ensembl_gene_id"
scdata.var["ensembl_gene_id"] = scdata.var.index
scdata.var

In [None]:
## reading annotation data
# Only the gene id, its coordinates and its chromosome are used downstream.
annotation_columns = ["ensembl_gene_id", "start_position", "end_position", "chromosome_name"]
annotation_df = pd.read_csv(
    gene_annotation_path,
    usecols=annotation_columns,
    # Chromosome labels must stay strings: with dtype inference an all-numeric (or mixed)
    # column is parsed as integers, and the later comparison against the string
    # TARGET_CHROMOSOME would then silently select no genes.
    dtype={"chromosome_name": str},
).loc[:, annotation_columns]  # `usecols` does not guarantee column order; enforce it here
# Keep only the genes that are present in the expression matrix.
annotation_df = annotation_df.loc[annotation_df["ensembl_gene_id"].isin(scdata.var_names)]
annotation_df

In [None]:
## merging the scdata.var df with the annotations
# A left join keeps exactly one row per gene of `scdata`, in the original order, so the
# result can be assigned back to `scdata.var`. The default inner join would drop every gene
# without an annotation row and make that assignment fail on a length mismatch.
# `validate="many_to_one"` raises a clear MergeError if a gene id appears more than once in
# the annotation table (which would otherwise duplicate rows).
# Genes without annotation end up with missing start/end/chrom values.
merged_df = scdata.var.merge(
    annotation_df,
    on="ensembl_gene_id",
    how="left",
    validate="many_to_one",
)
merged_df = merged_df.rename(
    columns={
        "ensembl_gene_id": "Geneid",
        "start_position": "start",
        "end_position": "end",
        "chromosome_name": "chrom",
    }
)
# The merge resets the index; restore the gene ids as index and drop the helper column.
merged_df.index = merged_df["Geneid"]
merged_df = merged_df.drop(["Geneid"], axis=1)
assert len(merged_df) == scdata.n_vars, "annotation merge changed the number of genes"
scdata.var = merged_df
scdata.var

In [None]:
## load the genotype data from the converted store
gdata = read_sgkit_zarr(zarr_file_path)
## re-key the obs table by its "id" column, so that rows can be selected
## by the labels used as index in the expression data
gdata.obs = gdata.obs.set_index(keys="id")
gdata

In [None]:
## constructing result output
# `results` collects one dict per row returned by the eQTL helper, tagged with the cell
# type it was computed for. It keeps growing across cell types, so the CSV written at the
# end of each iteration always contains everything computed so far.
results = []
for cell_label in scdata.obs.cell_label.unique():
    ## subsetting the scdata over the current cell_label
    # Explicit copy: the in-place scanpy steps below should not operate on a view of `scdata`.
    scdata_cell = scdata[scdata.obs.cell_label == cell_label].copy()
    ## normalizing counts
    sc.pp.normalize_total(scdata_cell)
    sc.pp.log1p(scdata_cell)
    # NOTE(review): this second normalize_total runs on log-transformed values -- confirm it is intended.
    sc.pp.normalize_total(scdata_cell)
    ## restricting to the genes annotated on the target chromosome
    scdata_cell = scdata_cell[:, scdata_cell.var["chrom"] == TARGET_CHROMOSOME]
    ## aggregating the data (mean expression per individual)
    pbdata = sc.get.aggregate(scdata_cell, "individual", "mean")
    gdata_cell = gdata[pbdata.obs.index]
    pbdata.X = pbdata.layers["mean"]
    ## sanity check (we have all the individuals from both data sources)
    assert (pbdata.obs.index == gdata_cell.obs.index).all(), (
        "individuals differ between expression and genotype data"
    )
    ## filter out genes that are expressed in less than ten individuals
    sc.pp.filter_genes(pbdata, min_cells=10)
    ## performing eqtl tests over the remaining genes of the target chromosome
    for target_gene in tqdm(pbdata.var_names, desc=str(cell_label)):
        eqtl_results = get_all_eqtls_on_single_gene(pbdata, gdata_cell, target_gene, CIS_WINDOW)
        if len(eqtl_results) == 0:
            # Fixed: the message used the undefined name `cis_window` (NameError).
            print(f"No matches found for the current combination of gene {target_gene} and {cell_label} using window of {CIS_WINDOW}")
            # Fixed: the placeholder must be a one-element list of rows; a bare dict would be
            # iterated key by key below and fail when unpacked with `**`.
            eqtl_results = [{"target_gene": target_gene, "no_tested_variants": 0}]
        results.extend({"cell_label": cell_label, **row} for row in eqtl_results)
    ## constructing output DataFrame
    eqtl_results_df = pd.DataFrame(results)
    ## saving the resulting dataframe (rewritten after every cell type as a checkpoint)
    eqtl_results_df.to_csv(dump_path, index=False)
## disabled post-processing (per-gene Bonferroni and FDR correction), kept for reference
# eqtl_results_df["pv_reject"] = eqtl_results_df["min_pv"] < THRESHOLD
# eqtl_results_df["bf_pv"] = np.clip(eqtl_results_df["min_pv"]*eqtl_results_df["no_tested_variants"], 0, 1)
# eqtl_results_df["bf_pv_reject"] = eqtl_results_df["bf_pv"] < THRESHOLD
# eqtl_results_df["q_val"] = fdrcorrection(eqtl_results_df["bf_pv"].values)[1]
# eqtl_results_df["q_val_reject"] = eqtl_results_df["q_val"] < THRESHOLD
# eqtl_results_df.to_csv(f"/home/lollo/Work/hackathon/dump/eqtl_results.csv", index = False)

## TODOs

- [x] Run on all genes on Chromosome 22
- [x] For each gene store: minimum p value, number of variants tested, id of minimum pv variant, gene name
- [x] Bonferroni correction per hit: pv_gene = pv * num_cis_variants, np clip to (0,1)
- [x] Subset to Bonferroni-significant hits (pv_gene < 0.05)
- [x] Benjamini-Hochberg across tests -> qv
- [x] report # of qv < 0.05
- [ ] check how many hits you have compared to OneK1K
- [x] add gwas to tools
- [x] Figure out how to render several notebooks
- [ ] Stretch goal: all cell types