In [1]:
import logging
import warnings

import anndata as ad
import scanpy as sc
import pandas as pd

from tqdm.notebook import tqdm
from pathlib import Path

from cellink.io import read_sgkit_zarr
from cellink.tl import get_best_eqtl_on_single_gene

# Silence every warning for the rest of the session so library messages do
# not flood the cell outputs.
# NOTE(review): this is a blanket filter — it also hides warnings that may
# matter for this analysis (e.g. dtype or view-modification warnings);
# consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Logger named after the current module (not used by the cells shown here).
logger = logging.getLogger(__name__)

In [2]:
# When True, the paths cell selects the small "debug_" copies of the
# expression matrices instead of the full (gzipped) files.
DEBUG = False
# Chromosome whose genes are tested; compared as a string against the
# `chrom` gene annotation.
TARGET_CHROMOSOME = "22"
# Significance level; in the cells shown here it is only referenced by the
# commented-out post-processing at the end of the eQTL cell.
THRESHOLD = 0.05
## running the eqtl test
# Window size handed to get_best_eqtl_on_single_gene for every gene
# (presumably base pairs around the gene — confirm against cellink).
CIS_WINDOW = 1_000_000

In [3]:
## paths
# Root folder of the OneK1K data (machine-specific); the commented line is an
# alternative location kept for switching between checkouts.
DATA = Path("/home/lollo/Work/hackathon/data/Yazar_OneK1K")
# DATA = Path("/Users/jan.engelmann/projects/sc-eqtl/data")

## genotypes: filtered chr22 VCF plus the converted stores derived from it
vcf_file_path = DATA / "OneK1K_imputation_post_qc_r2_08/filter_vcf_r08/chr22.dose.filtered.R2_0.8.vcf.gz"

# The converted stores live in a sibling of the VCF folder (two levels up
# from the VCF file itself); the folder is created if it is missing.
zarr_path = vcf_file_path.parents[1] / "filter_zarr_r08"
zarr_path.mkdir(exist_ok=True)

# Only the last extension is swapped: "<name>.vcf.gz" -> "<name>.vcf.icf".
icf_file_path = (zarr_path / vcf_file_path.name).with_suffix(".icf")
# Here ".gz" is stripped first, then ".vcf" is swapped: "<name>.vcf.gz" -> "<name>.vcz".
zarr_file_path = zarr_path / Path(vcf_file_path.stem).with_suffix(".vcz")

## gene coordinates
gene_annotation_path = DATA.joinpath("gene_annotation.csv")

## expression matrices: small debug copies or the full gzipped files
scdata_path = DATA / (
    "debug_OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad"
    if DEBUG
    else "OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad.gz"
)
scdata_annotated_path = DATA / (
    "debug_OneK1K_cohort_gene_expression_matrix_14_celltypes_filtered_annotated.h5ad"
    if DEBUG
    else "OneK1K_cohort_gene_expression_matrix_14_celltypes_filtered_annotated.h5ad.gz"
)

## where the eQTL result table is written
dump_path = "/home/lollo/Work/hackathon/dump/eqtl_results.csv"

print(gene_annotation_path, zarr_file_path, scdata_path)

/home/lollo/Work/hackathon/data/Yazar_OneK1K/gene_annotation.csv /home/lollo/Work/hackathon/data/Yazar_OneK1K/OneK1K_imputation_post_qc_r2_08/filter_zarr_r08/chr22.dose.filtered.R2_0.8.vcz /home/lollo/Work/hackathon/data/Yazar_OneK1K/OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad.gz


In [4]:
## reading single cell data
scdata = ad.read_h5ad(scdata_path)
## adding column needed to match the annotated data
scdata.var["ensembl_gene_id"] = scdata.var_names
scdata.var

Unnamed: 0_level_0,GeneSymbol,features,ensembl_gene_id
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000243485,MIR1302-10,MIR1302-10,ENSG00000243485
ENSG00000237613,FAM138A,FAM138A,ENSG00000237613
ENSG00000186092,OR4F5,OR4F5,ENSG00000186092
ENSG00000238009,RP11-34P13.7,RP11-34P13.7,ENSG00000238009
ENSG00000239945,RP11-34P13.8,RP11-34P13.8,ENSG00000239945
...,...,...,...
ENSG00000215635,AC145205.1,AC145205.1,ENSG00000215635
ENSG00000268590,BAGE5,BAGE5,ENSG00000268590
ENSG00000251180,CU459201.1,CU459201.1,ENSG00000251180
ENSG00000215616,AC002321.2,AC002321.2,ENSG00000215616


In [5]:
## reading annotation data
annotation_df = pd.read_csv(gene_annotation_path).loc[
    :, ["ensembl_gene_id", "start_position", "end_position", "chromosome_name"]
]
annotation_df = annotation_df.loc[annotation_df.ensembl_gene_id.isin(scdata.var_names)]
annotation_df

Unnamed: 0,ensembl_gene_id,start_position,end_position,chromosome_name
4,ENSG00000229483,23743974,23744736,13
7,ENSG00000232849,93708910,93710179,13
11,ENSG00000229558,23993110,24002818,13
13,ENSG00000224394,94470677,94488397,13
15,ENSG00000232977,24040710,24061603,13
...,...,...,...,...
61264,ENSG00000228242,14186223,14189784,3
61265,ENSG00000150867,22823778,23003484,10
61266,ENSG00000255021,14313873,14345345,3
61267,ENSG00000251576,14389951,14394068,3


In [6]:
## merging the scdata.var df with the annotations
merged_df = scdata.var.merge(annotation_df, on="ensembl_gene_id")
merged_df = merged_df.rename(
    columns={"ensembl_gene_id": "Geneid", "start_position": "start", "end_position": "end", "chromosome_name": "chrom"}
)
merged_df.index = merged_df["Geneid"]
merged_df = merged_df.drop(["Geneid"], axis=1)
scdata.var = merged_df
scdata.var

Unnamed: 0_level_0,GeneSymbol,features,start,end,chrom
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000243485,MIR1302-10,MIR1302-10,29554,31109,1
ENSG00000237613,FAM138A,FAM138A,34554,36081,1
ENSG00000186092,OR4F5,OR4F5,69091,70008,1
ENSG00000238009,RP11-34P13.7,RP11-34P13.7,89295,133566,1
ENSG00000239945,RP11-34P13.8,RP11-34P13.8,89551,91105,1
...,...,...,...,...,...
ENSG00000215635,AC145205.1,AC145205.1,44537,47211,GL000204.1
ENSG00000268590,BAGE5,BAGE5,899,2487,GL000237.1
ENSG00000251180,CU459201.1,CU459201.1,12836,34543,GL000242.1
ENSG00000215616,AC002321.2,AC002321.2,3905,22432,GL000201.1


In [7]:
## reading genetic data
gdata = read_sgkit_zarr(zarr_file_path)
gdata.obs = gdata.obs.set_index("id")
gdata

AnnData object with n_obs × n_vars = 1034 × 143083
    var: 'chrom', 'pos', 'a0', 'a1', 'AF', 'ER2', 'maf', 'R2', 'contig', 'id', 'id_mask', 'quality'
    varm: 'filter'

In [8]:
# Cell types (values of scdata.obs.cell_label) on which the eQTL scan is run;
# the commented-out alternative in the eQTL cell iterates over every label.
cells_to_analyse = ["CD4 ET", "CD4 NC", "B IN", "B Mem", "Plasma"]

In [9]:
## constructing result output
results = []
# for cell_label in scdata.obs.cell_label.unique():
for cell_label in cells_to_analyse:
    ## subsetting the scdata over the current cell_label
    scdata_cell = scdata[scdata.obs.cell_label == cell_label]
    ## normalizing counts
    sc.pp.normalize_total(scdata_cell)
    sc.pp.log1p(scdata_cell)
    sc.pp.normalize_total(scdata_cell)
    scdata_cell = scdata_cell[:, scdata_cell.var["chrom"] == TARGET_CHROMOSOME]
    ## aggregating the data
    pbdata = sc.get.aggregate(scdata_cell, "individual", "mean")
    gdata_cell = gdata[pbdata.obs.index]
    pbdata.X = pbdata.layers["mean"]
    ## sanity check (we have all the individuals from both data sources)
    assert (pbdata.obs.index == gdata_cell.obs.index).all()
    ## filter out genes that are expressed in less than ten individuals
    sc.pp.filter_genes(pbdata, min_cells=10)
    ## performing eqtl tests over the genes with chrom 22
    for target_gene in tqdm(pbdata.var_names):
        eqtl_results = get_best_eqtl_on_single_gene(pbdata, gdata_cell, target_gene, CIS_WINDOW)
        if len(eqtl_results) == 0:
            print(
                f"No matches found for the current combination of gene {target_gene} and {cell_label} using window of {cis_window}"
            )
            eqtl_results = {"target_gene": target_gene, "no_tested_variants": 0}
        eqtl_results["cell_label"] = cell_label
        results.append(eqtl_results)
    ## constructing output DataFrame
    eqtl_results_df = pd.DataFrame(results)
    ## saving the resulting dataframe
    eqtl_results_df.to_csv(dump_path, index=False)
# eqtl_results_df["pv_reject"] = eqtl_results_df["min_pv"] < THRESHOLD
# eqtl_results_df["bf_pv"] = np.clip(eqtl_results_df["min_pv"]*eqtl_results_df["no_tested_variants"], 0, 1)
# eqtl_results_df["bf_pv_reject"] = eqtl_results_df["bf_pv"] < THRESHOLD
# eqtl_results_df["q_val"] = fdrcorrection(eqtl_results_df["bf_pv"].values)[1]
# eqtl_results_df["q_val_reject"] = eqtl_results_df["q_val"] < THRESHOLD
# eqtl_results_df.to_csv(f"/home/lollo/Work/hackathon/dump/eqtl_results.csv", index = False)

  0%|          | 0/440 [00:00<?, ?it/s]

  0%|          | 0/515 [00:00<?, ?it/s]

  0%|          | 0/455 [00:00<?, ?it/s]

  0%|          | 0/426 [00:00<?, ?it/s]

  0%|          | 0/350 [00:00<?, ?it/s]

## TODOs

- [x] Run on all genes on Chromosome 22
- [x] For each gene store: minimum p value, number of variants tested, id of minimum pv variant, gene name
- [x] Bonferroni correction per hit: pv_gene = pv * num_cis_variants, np clip to (0,1)
- [x] Subset to Bonferroni-significant hits (pv_gene < 0.05)
- [x] Benjamini–Hochberg correction across tests -> qv
- [x] report # of qv < 0.05
- [ ] check how many hits you have compared to OneK1K
- [x] add gwas to tools
- [x] Figure out how to render several notebooks
- [ ] Stretch goal: all cell types