In [None]:
import gc
import os

import numpy as np
import pandas as pd
import pychromvar
import scanpy as sc
import scipy.sparse as sp
from scipy.stats import norm
from scipy.stats import spearmanr, pearsonr


In [None]:

#### 1. Load the paired-cell data
paired_adata = sc.read_h5ad("/home/share/huadjyin/home/zhouxuanchi/HIV/atac_to_gene_new_data_0218/data/test_adata_process.h5ad")

# Split the combined matrix by column position: the first 582 columns are
# taken as the RNA part and everything after them as the ATAC part.
# NOTE(review): 582 is a hard-coded feature count -- confirm it matches this file.
rna_feature_count = 582
paired_rna_data = paired_adata[:, :rna_feature_count].copy()
paired_atac_data = paired_adata[:, rna_feature_count:].copy()

# The combined object is no longer needed once both halves are copied out.
del paired_adata
gc.collect()

# Peak names use '-' instead of ':' everywhere below.
paired_atac_data.var_names = [peak_name.replace(':', '-') for peak_name in paired_atac_data.var_names]

# Peak-to-gene link table, renamed to the gene_id / peak_id columns used later.
gene_peak = pd.read_csv(
    '/home/share/huadjyin/home/lutianyu/01HIV/02data/peak_gene/peak2gene_peaks_genesplit_filtered.csv'
).rename(columns={'Matched_Gene': 'gene_id', 'position': 'peak_id'})
gene_peak['peak_id'] = gene_peak['peak_id'].str.replace(':', '-')


def _select_cmono_hds(adata):
    """Return a copy of `adata` restricted to cMono-CD14 cells in the HDs stage."""
    cell_info = adata.obs
    wanted = (cell_info['celltype_L3'] == 'cMono-CD14') & (cell_info['rna_stage'] == 'HDs')
    return adata[wanted].copy()


# Keep only the cell-type / group subset of each modality.
paired_rna_data_cMono_HDs = _select_cmono_hds(paired_rna_data)
paired_atac_data_cMono_HDs = _select_cmono_hds(paired_atac_data)

# Only the cell-type / group subsets are kept from here on.
del paired_rna_data, paired_atac_data
gc.collect()


#### 2. Load the original ATAC data (used for background-peak selection and for
####    reading background-peak values); this file is also cell-type / group specific.
atac_adata_original = sc.read_h5ad("/home/share/huadjyin/home/lutianyu/01HIV/02data/atac/paired_ataccell_original_cMono-CD14_HDs.h5ad")
atac_adata_original.var_names = [peak_name.replace(':', '-') for peak_name in atac_adata_original.var_names]

# Reorder the original ATAC cells to follow the paired ATAC subset, using the
# 'atac_cellname' column as the lookup key.
paired_cell_order = paired_atac_data_cMono_HDs.obs['atac_cellname'].tolist()
atac_adata = atac_adata_original[paired_cell_order, :]
atac_adata

In [None]:
# Check atac_adata for peaks whose total count over all cells is zero.
# np.asarray(...).ravel() gives a flat 1-D vector whether X is a dense ndarray,
# a scipy sparse matrix (sum returns np.matrix) or a scipy sparse array (sum
# returns ndarray); the previous `.A1` only exists on np.matrix.
peak_sums = np.asarray(atac_adata.X.sum(axis=0)).ravel()

zero_peak_indices = np.where(peak_sums == 0)[0]

if len(zero_peak_indices) > 0:
    print(f"\n  找到 {len(zero_peak_indices)} 个总读取数为零的peak，索引是: {zero_peak_indices}")
    print(f"  对应的peak名称是: {atac_adata.var_names[zero_peak_indices].tolist()}")
    valid_peak_mask = (peak_sums > 0)
    # Drop the all-zero peaks; per the original author, pychromvar fails otherwise.
    atac_adata = atac_adata[:, valid_peak_mask].copy()
else:
    print("\n  没有发现总读取数为零的峰。")

# Add background peaks: attach peak sequences and GC bias, then draw the
# background peak sets. niterations=100 is the number of background peaks
# requested per peak; the result is read back from varm['bg_peaks'] later.
pychromvar.add_peak_seq(atac_adata, genome_file='/home/share/huadjyin/home/caixianjun/Projects/scRNA/AR_All/code_0506/analy/run_dorc/hg38.fa', delimiter='-')
pychromvar.add_gc_bias(atac_adata)
pychromvar.get_bg_peaks(atac_adata, niterations=100, n_jobs=16)

In [None]:
# Number of consecutive cells merged into one pseudo-bulk sample: the next cell
# assigns batch_id = cell position // batch_size. With 1, every cell is its own
# pseudo-bulk sample.
batch_size = 1 


In [None]:
# Assign a pseudo-bulk batch id to every cell: consecutive runs of `batch_size`
# cells (in current row order) share one id, in both modalities.
paired_rna_data_cMono_HDs.obs['batch_id'] = np.arange(len(paired_rna_data_cMono_HDs)) // batch_size
paired_atac_data_cMono_HDs.obs['batch_id'] = np.arange(len(paired_atac_data_cMono_HDs)) // batch_size


# Group by sample as well when sample labels are available. The guard tests the
# column that is actually read ('atac_sample') in both objects; it previously
# tested 'rna_sample', which could raise KeyError when 'atac_sample' was missing.
group_keys = ['batch_id']
if (
    'atac_sample' in paired_rna_data_cMono_HDs.obs.columns
    and 'atac_sample' in paired_atac_data_cMono_HDs.obs.columns
):
    paired_rna_data_cMono_HDs.obs['temp_sample'] = paired_rna_data_cMono_HDs.obs['atac_sample']
    paired_atac_data_cMono_HDs.obs['temp_sample'] = paired_atac_data_cMono_HDs.obs['atac_sample']
    group_keys = ['temp_sample', 'batch_id']


# Ordered list of pseudo-bulk sample ids. observed=True keeps only groups that
# contain cells; with a categorical sample column the default would also list
# every empty (sample, batch) combination.
unique_rna_groups = paired_rna_data_cMono_HDs.obs.groupby(group_keys, observed=True).size().index
if len(group_keys) == 1:
    rna_bulk_sample_names = [f"batch_{group}" for group in unique_rna_groups]
else:
    rna_bulk_sample_names = [f"{group[0]}_batch_{group[1]}" for group in unique_rna_groups]


# Pseudo-bulk RNA: sum NFKBIA expression within each (stage, batch_id) group.
# The grouping call is kept exactly as the ATAC cell below uses it, because the
# two pseudo-bulk tables are later compared row by row.
df_rna = paired_rna_data_cMono_HDs[:, 'NFKBIA'].to_df()
df_rna['batch_id'] = paired_rna_data_cMono_HDs.obs['batch_id']
df_rna['stage'] = paired_rna_data_cMono_HDs.obs['rna_stage']

df_rna_bulk = df_rna.groupby(['stage', 'batch_id']).sum().reset_index()

# Peaks linked to NFKBIA in the peak-to-gene table; the cell displays how many.
peak_list = gene_peak[gene_peak['gene_id'] == 'NFKBIA']['peak_id'].tolist()
len(peak_list)

In [None]:
# Background-corrected peak-gene correlation test for NFKBIA.
#
# For each peak linked to the gene:
#   1. build pseudo-bulk accessibility for the peak (paired data) and for its
#      background peaks (original ATAC data) by summing per (rna_stage, batch_id);
#   2. compute the Spearman correlation between pseudo-bulk NFKBIA expression and
#      the peak, and between expression and every background peak;
#   3. turn the observed correlation into a z-score against the background
#      correlations and a two-sided normal p-value.
# One dict per tested peak is appended to `results`.
target_gene = 'NFKBIA'
results = []
skipped_peaks = []  # peaks from peak_list that cannot be looked up

# Loop-invariant setup, hoisted out of the loop.
# The stage labels are copied positionally, so this relies on atac_adata rows
# being in the same cell order as the paired data
# (NOTE(review): assumed from the reordering done at load time -- confirm).
atac_adata.obs["rna_stage"] = paired_rna_data_cMono_HDs.obs["rna_stage"].values
rna_bulk_expr = df_rna_bulk[target_gene]  # pseudo-bulk RNA expression of the paired data

for peak in peak_list:
    # A linked peak may be absent, e.g. because it was dropped as an all-zero
    # peak before background selection. Skip it instead of aborting the whole
    # run with a KeyError.
    if peak not in atac_adata.var_names or peak not in paired_atac_data_cMono_HDs.var_names:
        skipped_peaks.append(peak)
        continue

    # Observed peak from the paired data.
    observed_peak = paired_atac_data_cMono_HDs[:, peak].to_df()
    # Indices of the background peaks chosen for this peak, then their values
    # from the original ATAC data.
    bg_idx = atac_adata.varm['bg_peaks'][atac_adata.var_names.get_loc(peak)]
    bg_peak = atac_adata[:, bg_idx].to_df()

    # Attach grouping keys. batch_id is written positionally (.values) because
    # the original ATAC object has its own cell names.
    observed_peak['batch_id'] = paired_atac_data_cMono_HDs.obs['batch_id'].values
    observed_peak['rna_stage'] = paired_atac_data_cMono_HDs.obs['rna_stage']
    bg_peak['batch_id'] = paired_rna_data_cMono_HDs.obs['batch_id'].values
    bg_peak['rna_stage'] = atac_adata.obs['rna_stage']

    observed_peak_bulk = observed_peak.groupby(['rna_stage', 'batch_id']).sum().reset_index()
    bg_peak_bulk = bg_peak.groupby(['rna_stage', 'batch_id']).sum().reset_index()

    observed_corr = spearmanr(rna_bulk_expr, observed_peak_bulk[peak]).correlation

    # After reset_index the first two columns are the grouping keys; every
    # remaining column is one background peak. Iterating by position uses
    # however many background peaks exist instead of a hard-coded 100.
    bg_values = bg_peak_bulk.iloc[:, 2:]
    bg_corrs = np.array(
        [
            spearmanr(rna_bulk_expr, bg_values.iloc[:, col]).correlation
            for col in range(bg_values.shape[1])
        ],
        dtype=float,
    )

    # spearmanr yields NaN for a constant input. Use only finite background
    # correlations for the null distribution; previously one NaN made mean/std
    # NaN and the peak was silently reported with z=0, p=1.
    finite_bg = bg_corrs[np.isfinite(bg_corrs)]
    if finite_bg.size > 0:
        mean_bg = float(finite_bg.mean())
        std_bg = float(finite_bg.std())
    else:
        mean_bg = np.nan
        std_bg = np.nan

    # The z-score is undefined when the background has no spread (or no finite
    # values); report NaN rather than a fabricated 0 / p=1.
    if std_bg > 0:
        z_score = (observed_corr - mean_bg) / std_bg
    else:
        z_score = np.nan
    # Survival function instead of 1 - cdf: keeps precision for large |z|,
    # where 1 - cdf rounds to exactly 0.
    p_value = 2 * norm.sf(abs(z_score))

    # Store the result for this peak.
    results.append({
        'peak_id': peak,
        'gene_id': target_gene,
        'observed_correlation': observed_corr,
        'mean_background_correlation': mean_bg,
        'std_background_correlation': std_bg,
        'z_score': z_score,
        'p_value': p_value
    })

if skipped_peaks:
    print(f"Skipped {len(skipped_peaks)} peak(s) not found in the ATAC matrices: {skipped_peaks}")

In [None]:
# Convert the per-peak results to a DataFrame and save them as CSV.
# The column list is fixed so the file has the same header even when `results`
# is empty (no peak could be tested).
result_columns = [
    'peak_id',
    'gene_id',
    'observed_correlation',
    'mean_background_correlation',
    'std_background_correlation',
    'z_score',
    'p_value',
]
results_df = pd.DataFrame(results, columns=result_columns)
output_path = "/home/share/huadjyin/home/lutianyu/01HIV/02data/downstream/permutaion/NFKBIA/cMono-CD14_HDs.csv"
# Create the target directory first: to_csv does not create missing parent
# directories, and failing here would discard the computed results.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
results_df.to_csv(output_path, index=False)

