### author by yangshichen
### 注意：脚本仅供参考，使用前请仔细阅读

In [1]:
import os
import scanpy as sc
import anndata as ad
import numpy as np
import pandas as pd
import seaborn as sb
import seaborn as sns
from matplotlib.pyplot import rc_context
import matplotlib.pyplot as plt
from scipy.io import mmread
from scipy.sparse import csr_matrix
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
from scipy.sparse import issparse
import decoupler as dc
from sklearn.preprocessing import QuantileTransformer
from multiprocessing import Pool, cpu_count
from functools import partial
from sklearn.preprocessing import QuantileTransformer, StandardScaler
quantile_transformer = QuantileTransformer(output_distribution='normal', random_state=0)
scaler = StandardScaler()

import warnings
warnings.filterwarnings("ignore")

In [2]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=100, frameon=False)
sc._settings.ScanpyConfig.n_jobs=70

scanpy==1.9.8 anndata==0.9.2 umap==0.5.7 numpy==1.24.4 scipy==1.10.1 pandas==2.0.3 scikit-learn==1.3.2 statsmodels==0.14.1 pynndescent==0.5.13


In [3]:
import os
os.environ["R_HOME"] = "/home/yangshichen//mambaforge/envs/QTL/lib/R"
os.environ["R_LIBS_USER"] = "/home/yangshichen/mambaforge/envs/QTL/lib/R/library"
import pandas as pd
import torch
import tensorqtl
from tensorqtl import  cis
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"torch: {torch.__version__} (CUDA {torch.version.cuda}), device: {device}")
print(f"pandas: {pd.__version__}")

torch: 2.4.1+cu121 (CUDA 12.1), device: cuda
pandas: 2.0.3


### cis-eQTL mapping: interactions（disease status影响SNP对gene的表达）

In [4]:
#Read genotype_df and vatiant_df
genotype_df = pd.read_parquet('/media/AnalysisDisk2/Yangshichen/0_HIV_RNA/QTL/01.Dynamic/01.Data/01.Genotype/142_sample_genotype.parquet')
variant_df = pd.read_parquet('/media/AnalysisDisk2/Yangshichen/0_HIV_RNA/QTL/01.Dynamic/01.Data/01.Genotype/142_sample_variant_df.parquet')

In [10]:
#设置bed文件所在的文件夹路径
bed_dir = '/media/AnalysisDisk2/Yangshichen/0_HIV_RNA/QTL/01.Dynamic/01.Data/02.scRNA-seq/01.pseudobulk/06.gene_expression_bed/'
output_dir = '/media/AnalysisDisk2/Yangshichen/0_HIV_RNA/QTL/01.Dynamic/03.Result/eQTL/'

#获取所有以.bed结尾的文件
bed_files = [f for f in os.listdir(bed_dir) if f.endswith('.bed')]
bed_files

['CD8_Naive_T-CCR7.bed',
 'ncMono-FCGR3A.bed',
 'Aptypical_Memory_B-ITGAX.bed',
 'CD4_Naive_T-CCR7.bed',
 'CD4_Naive_T-SOX4.bed',
 'CD4_Tcm-CXCR5.bed',
 'CD4_Tcm-GPR183.bed',
 'CD4_Tcm-IFIT3.bed',
 'CD4_Tcm-SOX4.bed',
 'CD4_Tem-CCR7neg.bed',
 'CD4_Tfh_like-CXCR5.bed',
 'CD4_Th-TNFRSF11A.bed',
 'CD4_Th1-GZMK.bed',
 'CD4_Th17-RORC.bed',
 'CD4_Th22-CCR10.bed',
 'CD4_Treg-FCRL3.bed',
 'CD4_Treg-FOXP3.bed',
 'CD8_CTL-GZMB.bed',
 'CD8_CTL-GZMK.bed',
 'ncMono-IFI44L.bed',
 'ncMono-IFIT1.bed',
 'NKT-IFIT3.bed',
 'NKT-NCR1.bed',
 'NK_bright-XCL1.bed',
 'pDC-LILRA4.bed',
 'Plasma_B-IGHA1.bed',
 'Switched_Memory_B-CD27.bed',
 'Switched_Memory_B-CD86.bed',
 'Switched_Memory_B-IGHE.bed',
 'Terminal_NK_dim-CD160neg.bed',
 'Transitional_B-NEIL1.bed',
 'Transitional_NK-GZMK.bed',
 'Unswitched_Memory_B-CD1C.bed',
 'Unswitched_Memory_B-IFIT3.bed',
 'Unswitched_Memory_B-JAM3.bed',
 'CD8_Tcm-GPR183.bed',
 'CD8_Tcm-GZMK.bed',
 'CD8_Tcm-IFI44L.bed',
 'CD8_Tem-GZMK.bed',
 'cDC2-CD1C.bed',
 'cMono-CD14.bed',


In [11]:
# 遍历文件，处理并创建对应文件夹
for bed_file in bed_files:
    try:
        cell_name = os.path.splitext(bed_file)[0]  # 去掉.bed后缀
        folder_path = os.path.join(output_dir, cell_name)

        # read_expression_file and covariates
        expression_bed = f'/media/AnalysisDisk2/Yangshichen/0_HIV_RNA/QTL/01.Dynamic/01.Data/02.scRNA-seq/01.pseudobulk/06.gene_expression_bed/{cell_name}.bed'
        covariates_file = f'/media/AnalysisDisk2/Yangshichen/0_HIV_RNA/QTL/01.Dynamic/01.Data/02.scRNA-seq/01.pseudobulk/05.PEER/01.factor/{cell_name}.csv'

        # load phenotypes and covariates
        phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(expression_bed)
        covariates_df = pd.read_csv(covariates_file, sep=',', index_col=0)
        phenotype_df = phenotype_df[covariates_df.index]

        # cis-QTL mapping with interaction
        interaction_df = covariates_df[['stage']].copy()
        interaction_df['stage'] = interaction_df['stage'].replace({1: 0, 2: 1})  # HDs和IRs合并
        covariates_df_new = covariates_df.drop(columns=['stage'])

        os.makedirs(f'{folder_path}/diseases_2groups/', exist_ok=True)
        cis.map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df, 'diseases_2groups',
                        covariates_df=covariates_df_new, interaction_df=interaction_df,
                        maf_threshold_interaction=0.1, run_eigenmt=True,
                        output_dir=f'{folder_path}/diseases_2groups/', write_top=True, write_stats=True)
    except Exception as e:
        print(f"跳过 {bed_file}，错误信息: {e}")
        continue

cis-QTL mapping: nominal associations for all variant-phenotype pairs
  * 142 samples
  * 11197 phenotypes
  * 19 covariates
  * 7540937 variants
  * including 1 interaction term(s)
    * using 0.1 MAF threshold
  * cis-window: ±1,000,000
  * checking phenotypes: 11197/11197
    ** dropping 9 phenotypes without variants in cis-window
  * Computing associations
    Mapping chromosome chr1
    processing phenotype 1174/11188    time elapsed: 1.38 min
    * writing output
    Mapping chromosome chr2
    processing phenotype 1939/11188    time elapsed: 2.40 min
    * writing output
    Mapping chromosome chr3
    processing phenotype 2583/11188    time elapsed: 3.21 min
    * writing output
    Mapping chromosome chr4
    processing phenotype 2979/11188    time elapsed: 3.79 min
    * writing output
    Mapping chromosome chr5
    processing phenotype 3476/11188    time elapsed: 4.46 min
    * writing output
    Mapping chromosome chr6
    processing phenotype 4059/11188    time elapsed: 5

### cis-caQTL mapping: interactions（disease status影响SNP对peak的开放）

In [4]:
#Read genotype_df and vatiant_df
genotype_df = pd.read_parquet('/media/AnalysisDisk2/Yangshichen/0_HIV_RNA/QTL/01.Dynamic/01.Data/01.Genotype/142_sample_genotype.parquet')
variant_df = pd.read_parquet('/media/AnalysisDisk2/Yangshichen/0_HIV_RNA/QTL/01.Dynamic/01.Data/01.Genotype/142_sample_variant_df.parquet')

In [5]:
#设置bed文件所在的文件夹路径
bed_dir = '/media/AnalysisDisk2/Yangshichen/0_HIV_RNA/QTL/01.Dynamic/01.Data/03.scATAC-seq/01.pseudobulk/06.peak_accessibility_bed/'
output_dir = '/media/AnalysisDisk2/Yangshichen/0_HIV_RNA/QTL/01.Dynamic/03.Result/caQTL/'

#获取所有以.bed结尾的文件
bed_files = [f for f in os.listdir(bed_dir) if f.endswith('.bed')]

In [6]:
#Aptypical_Memory_B-ITGAX
bed_files

['cMono-IFI44L.bed',
 'Aptypical_Memory_B-ITGAX.bed',
 'CD4_Naive_T-CCR7.bed',
 'CD4_Tcm-CXCR5.bed',
 'CD4_Tcm-GPR183.bed',
 'CD4_Tem-CCR7neg.bed',
 'CD4_Tfh_like-CXCR5.bed',
 'CD4_Th-TNFRSF11A.bed',
 'CD4_Th1-GZMK.bed',
 'CD4_Th17-RORC.bed',
 'CD4_Th22-CCR10.bed',
 'CD4_Treg-FCRL3.bed',
 'CD4_Treg-FOXP3.bed',
 'CD8_CTL-GZMB.bed',
 'CD8_CTL-GZMK.bed',
 'CD8_Naive_T-CCR7.bed',
 'CD8_Tcm-GZMK.bed',
 'CD8_Tem-GZMK.bed',
 'cDC2-CD1C.bed',
 'cMono-CD14.bed',
 'gdT2-GZMH.bed',
 'MAIT-SLC4A10.bed',
 'Mature_NK_dim-FCGR3A.bed',
 'Naive_B-TCL1A.bed',
 'ncMono-FCGR3A.bed',
 'ncMono-IFI44L.bed',
 'ncMono-IFIT1.bed',
 'NKT-NCR1.bed',
 'NK_bright-XCL1.bed',
 'pDC-LILRA4.bed',
 'Plasma_B-IGHA1.bed',
 'Switched_Memory_B-CD27.bed',
 'Switched_Memory_B-CD86.bed',
 'Switched_Memory_B-IGHE.bed',
 'Unswitched_Memory_B-CD1C.bed']

In [7]:
# 遍历文件，处理并创建对应文件夹
for bed_file in bed_files:
    try:
        cell_name = os.path.splitext(bed_file)[0]  # 去掉.bed后缀
        folder_path = os.path.join(output_dir, cell_name)

        # read_expression_file and covariates
        expression_bed = f'/media/AnalysisDisk2/Yangshichen/0_HIV_RNA/QTL/01.Dynamic/01.Data/03.scATAC-seq/01.pseudobulk/06.peak_accessibility_bed/{cell_name}.bed'
        covariates_file = f'/media/AnalysisDisk2/Yangshichen/0_HIV_RNA/QTL/01.Dynamic/01.Data/03.scATAC-seq/01.pseudobulk/05.PEER/01.factor/{cell_name}.csv'

        # load phenotypes and covariates
        phenotype_df, phenotype_pos_df = tensorqtl.read_phenotype_bed(expression_bed)
        covariates_df = pd.read_csv(covariates_file, sep=',', index_col=0)
        phenotype_df = phenotype_df[covariates_df.index]

        # cis-QTL mapping with interaction
        interaction_df = covariates_df[['stage']].copy()
        interaction_df['stage'] = interaction_df['stage'].replace({1: 0, 2: 1})  # HDs和IRs合并
        covariates_df_new = covariates_df.drop(columns=['stage'])

        os.makedirs(f'{folder_path}/diseases_2groups/', exist_ok=True)
        cis.map_nominal(genotype_df, variant_df, phenotype_df, phenotype_pos_df, 'diseases_2groups',
                        covariates_df=covariates_df_new, interaction_df=interaction_df,
                        maf_threshold_interaction=0.1, run_eigenmt=True,
                        output_dir=f'{folder_path}/diseases_2groups/', write_top=True, write_stats=True)
    except Exception as e:
        print(f"跳过 {bed_file}，错误信息: {e}")
        continue

cis-QTL mapping: nominal associations for all variant-phenotype pairs
  * 142 samples
  * 44003 phenotypes
  * 19 covariates
  * 7540937 variants
  * including 1 interaction term(s)
    * using 0.1 MAF threshold
  * cis-window: ±1,000,000
  * checking phenotypes: 44003/44003
    ** dropping 27 phenotypes without variants in cis-window
  * Computing associations
    Mapping chromosome chr1
    processing phenotype 4316/43976    time elapsed: 13.44 min
    * writing output
    Mapping chromosome chr2
    processing phenotype 7698/43976    time elapsed: 24.94 min
    * writing output
    Mapping chromosome chr3
    processing phenotype 10556/43976    time elapsed: 35.27 min
    * writing output
    Mapping chromosome chr4
    processing phenotype 12309/43976    time elapsed: 42.80 min
    * writing output
    Mapping chromosome chr5
    processing phenotype 14520/43976    time elapsed: 51.33 min
    * writing output
    Mapping chromosome chr6
    processing phenotype 17062/43976    time 