In [1]:
## installing the package
! pip uninstall -q -y cellink
! pip install -q ../../.
! pwd

## importing the libraries
from pathlib import Path
import os
import pandas as pd

import scanpy as sc
import cellink as cl

## defining the path to the data
## The roots default to the locations used when this notebook was written, but
## can be redirected without editing the notebook by exporting CELLINK_DATA_DIR
## and/or CELLINK_DUMP_DIR before the kernel is started.
data_dir = os.environ.get("CELLINK_DATA_DIR", "/home/lollo/Work/hackathon/data/Yazar_OneK1K")
scdata_path = os.path.join(data_dir, "OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad.gz")
annotation_path = os.path.join(data_dir, "gene_annotation.csv")
## the trailing "" keeps the final path separator of the previous literal value
gdata_path = os.path.join(data_dir, "OneK1K_imputation_post_qc_r2_08", "plink", "")
dump_path = os.environ.get("CELLINK_DUMP_DIR", "/home/lollo/Work/out/dumps-sc-genetics")
## defining target chromosome and cell type
target_chromosome = "1"
target_cell_type = "CD4 ET"

/home/lollo/Work/code/repos/forks/sc-genetics/docs/notebooks


In [2]:
## load the single-cell AnnData object
scdata = sc.read_h5ad(scdata_path)

## load the gene annotation table, keeping only the id and coordinate columns
## and only the genes that are present in the single-cell data
annotation_columns = ["ensembl_gene_id", "start_position", "end_position", "chromosome_name"]
annotation_df = pd.read_csv(annotation_path)[annotation_columns]
annotation_df = annotation_df[annotation_df["ensembl_gene_id"].isin(scdata.var_names)]

## attach the coordinates to scdata.var (joined on the gene id), rename the
## annotation columns, and index the result by gene id while keeping the
## "Geneid" column itself
merged_df = (
    scdata.var.merge(annotation_df, left_index=True, right_on="ensembl_gene_id")
    .rename(
        columns={
            "ensembl_gene_id": "Geneid",
            "start_position": "start",
            "end_position": "end",
            "chromosome_name": "chrom",
        }
    )
    .set_index("Geneid", drop=False)
)
scdata.var = merged_df

## load the genotypes of the target chromosome from the plink file set
plink_file = os.path.join(gdata_path, f"chr{target_chromosome}.dose.filtered.R2_0.8")
gdata = cl.io.read_plink(plink_file)

## pair single-cell and genetic data per donor; donors are identified in the
## single-cell data through the "individual" key
data = cl.DonorData(adata=scdata, gdata=gdata, donor_key_in_sc_adata="individual")

[2024-11-20 17:31:40,280] INFO:cellink._core.donordata: Keeping 981/1034 donors
[2024-11-20 17:31:40,281] INFO:cellink._core.donordata: Dropping 53/1034 donors from genetic data
[2024-11-20 17:31:40,281] INFO:cellink._core.donordata: Dropping 0/981 donors from single-cell data


In [3]:
## running eqtl
## NOTE(review): the saved output of this cell reports target_chromosome='2'
## while the configuration cell sets "1", and the run was interrupted; the
## output looks stale -- re-run from a fresh kernel to confirm.
eqtl_options = {
    "n_sc_comps": 300,
    "n_genetic_pcs": 300,
    "dump_dir": dump_path,
}
cl.tl.eqtl(data, target_cell_type, target_chromosome, **eqtl_options)

  reference_data = reference_data.groupby(donor_key_in_scdata).agg(["unique"])
  reference_data = reference_data.groupby(donor_key_in_scdata).agg(["unique"])


[2024-11-20 17:31:51,735] INFO:cellink._core.donordata: Keeping 981/981 donors
[2024-11-20 17:31:51,736] INFO:cellink._core.donordata: Dropping 0/981 donors from genetic data
[2024-11-20 17:31:51,736] INFO:cellink._core.donordata: Dropping 0/981 donors from single-cell data
[2024-11-20 17:31:51,739] INFO:cellink.tl._eqtl: `target_genes` not defined, running the EQTL agains all the 1153 genes for current combination of target_cell_type='CD4 ET', target_chromosome='2'.


  0%|          | 0/1153 [00:00<?, ?it/s]

[2024-11-20 17:31:51,740] INFO:cellink.tl._eqtl: Running EQTL target_cell_type='CD4 ET', target_chromosome='2' target_gene='ENSG00000184731'


  0%|          | 1/1153 [00:02<50:13,  2.62s/it]

[2024-11-20 17:31:54,357] INFO:cellink.tl._eqtl: Running EQTL target_cell_type='CD4 ET', target_chromosome='2' target_gene='ENSG00000035115'


  0%|          | 2/1153 [00:05<50:08,  2.61s/it]

[2024-11-20 17:31:56,970] INFO:cellink.tl._eqtl: Running EQTL target_cell_type='CD4 ET', target_chromosome='2' target_gene='ENSG00000143727'


  0%|          | 3/1153 [00:07<49:27,  2.58s/it]

[2024-11-20 17:31:59,510] INFO:cellink.tl._eqtl: Running EQTL target_cell_type='CD4 ET', target_chromosome='2' target_gene='ENSG00000151353'


  0%|          | 4/1153 [00:10<49:04,  2.56s/it]

[2024-11-20 17:32:02,045] INFO:cellink.tl._eqtl: Running EQTL target_cell_type='CD4 ET', target_chromosome='2' target_gene='ENSG00000233296'


  0%|          | 5/1153 [00:12<49:02,  2.56s/it]

[2024-11-20 17:32:04,609] INFO:cellink.tl._eqtl: Running EQTL target_cell_type='CD4 ET', target_chromosome='2' target_gene='ENSG00000237667'


KeyboardInterrupt: 