## 计算49个性状与组织和细胞类型关联的得分
将细胞得分的topk细胞作为该性状相关的细胞，计算这些细胞与各个组织中细胞的富集情况

In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
import os

import matplotlib.pyplot as plt
# 设置全局字体大小为14
plt.rcParams['font.size'] = 14

import warnings
# 禁用所有警告
warnings.filterwarnings("ignore")

In [5]:
import os
def makedir(folder_path):
    """Make sure ``folder_path`` exists, creating it (and parents) if needed.

    A one-line status message is printed either way, saying whether the
    folder was created by this call or was already present.
    """
    if os.path.exists(folder_path):
        # Already there: report it and leave the path untouched.
        print(f"文件夹 '{folder_path}' 已存在，不进行任何处理")
        return
    os.makedirs(folder_path)
    print(f"文件夹 '{folder_path}' 已创建")

In [6]:
# List the files with a given extension that sit directly inside a folder.
def openFolderAll(folder_path,desired_extension):
    """Return paths of regular files in ``folder_path`` ending with ``desired_extension``.

    Only direct children are considered (no recursion); sub-directories are
    skipped even when their name ends with the extension. Each returned path
    is ``folder_path`` joined with the file name, in ``os.listdir`` order.
    """
    named_paths = (
        (name, os.path.join(folder_path, name))
        for name in os.listdir(folder_path)
    )
    return [
        path
        for name, path in named_paths
        if name.endswith(desired_extension) and os.path.isfile(path)
    ]

In [11]:
# Merge the per-cell-type score tables of every tissue into a single
# "allcells.csv" per trait, tagging each row with its cell type and tissue.
core = "core"
# core = "peri"
for trait in range(36,37):
    trait_dir = r"..\results\lcczscore-pip-coreperi-alltraits\%s\%s"%(core,trait)
    # Keep sub-folders only: "allcells.csv" is written into this same
    # directory below, so on a re-run it would otherwise be treated as a
    # tissue folder and os.listdir() would fail on it.
    tissuelist = [name for name in os.listdir(trait_dir)
                  if os.path.isdir(os.path.join(trait_dir, name))]

    tissue_frames = []
    for tissue in tissuelist:
        folder_path = os.path.join(trait_dir, tissue)
        celltypelist = openFolderAll(folder_path,".csv")

        celltype_frames = []
        for ct in celltypelist:
            # Cell-type name = file name up to its first dot.
            celltype = os.path.basename(ct).split(".")[0]
            df = pd.read_csv(ct)
            df["celltype"] = celltype
            celltype_frames.append(df)
        if not celltype_frames:
            # Tissue folder without any CSV contributes no rows.
            continue
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        # The per-file row index is kept, exactly as append used to do.
        tissue_df = pd.concat(celltype_frames)
        tissue_df["tissue"] = tissue
        tissue_frames.append(tissue_df)

    # pd.concat raises on an empty list, so fall back to an empty table.
    results_ct = pd.concat(tissue_frames) if tissue_frames else pd.DataFrame()
    results_ct.to_csv(os.path.join(trait_dir, "allcells.csv"))
    print(trait,"finish!")

36 finish!


In [3]:

def jaccard_similarity(set1, set2):
    """Return the Jaccard index |set1 ∩ set2| / |set1 ∪ set2| of two sets.

    When both sets are empty the union is empty and 0 is returned instead
    of dividing by zero.
    """
    combined = set1.union(set2)
    if not combined:
        return 0
    shared = set1.intersection(set2)
    return len(shared) / len(combined)

In [12]:
# Tissue-level enrichment of trait-related cells: the top 10% of cells by
# score are taken as trait-related, then each tissue is tested for
# over-representation of those cells.
# Hypergeometric distribution p-value
from scipy.stats import hypergeom
# Fisher's exact test
import numpy as np
from scipy.stats import fisher_exact

# core = "peri"
core = "core"
index = "lcczscore"
# index = "lcc_expresssum_zscore"
# index = "lcc_expressmean_zscore"
for trait in range(36,37):
    makedir(r"..\results\trait-cells-in-tissues-and-celltypes\tissues\%s"%trait)
    results_ct = pd.read_csv(r"..\results\lcczscore-pip-coreperi-alltraits\%s\%s\allcells.csv"%(core,trait))
    results_ct_sorted = results_ct.sort_values(by=index,ascending=False)
    # Top 10% of all cells, ranked by the chosen score column.
    results_ct_sorted_topk = results_ct_sorted.iloc[:int(len(results_ct_sorted) * (10 / 100))][[index,"tissue","celltype","cell"]]

    topkcells = set(list(results_ct_sorted_topk["cell"]))
    filenames = os.listdir(r"..\data\TS_FACS")
    # Tissue name = file name without the "TS_" prefix and ".h5ad" suffix.
    # Files that do not follow that pattern are skipped; slicing them would
    # produce meaningless tissue names.
    tissuelist = [f[len("TS_"):-len(".h5ad")] for f in filenames
                  if f.startswith("TS_") and f.endswith(".h5ad")]

    # Population size and number of trait-related cells are the same for
    # every tissue, so compute them once.
    M = len(results_ct_sorted)
    N = len(results_ct_sorted_topk)

    result_index = []
    for tissue in tissuelist:
        tissue_cell = results_ct[results_ct['tissue']==tissue]
        tissue_cell_topk = results_ct_sorted_topk[results_ct_sorted_topk['tissue']==tissue]
        tissue_cellnum = len(tissue_cell)
        tissue_cellnum_topk = len(tissue_cell_topk)

        # Jaccard index between the tissue's cells and its trait-related cells.
        jaccard = jaccard_similarity(set(tissue_cell["cell"]),set(tissue_cell_topk["cell"]))

        # Fisher's exact test p-value.
        # NOTE(review): the second row is a synthetic "10% of this tissue"
        # row, not counts taken from the remaining cells - confirm this is
        # the intended contingency table.
        b = tissue_cellnum
        a = tissue_cellnum_topk
        expected = b // 10  # same value as np.floor(b/10), kept as an int
        observed_table = [[a, b-a], [expected, b - expected]]  # 2x2 contingency table
        odds_ratio, p_value = fisher_exact(observed_table, alternative='greater')

        # Of M cells in total, N are trait-related; is observing k related
        # cells among the n cells of this tissue significantly high?
        # sf(k-1) gives P(X >= k).
        n = tissue_cellnum
        k = tissue_cellnum_topk
        Hpvalue = hypergeom.sf(k-1,M,N,n)

        result_index.append([tissue,tissue_cellnum,tissue_cellnum_topk,
                       jaccard,p_value,Hpvalue])

    # Passing the names to the constructor also works when no tissue was found.
    result_index_df = pd.DataFrame(
        result_index,
        columns=["tissue","cellnum","relatedCellnum","jaccard","fisher's pvalue","hypergeom pvalue"],
    )
    # Write into the folder prepared by makedir() above instead of a
    # machine-specific absolute path.
    result_index_df.to_csv(r"..\results\trait-cells-in-tissues-and-celltypes\tissues\%s\top10_%s_lcczscore.csv"%(trait,core))


文件夹 'E:\00_CeSOP\results\trait-cells-in-tissues-and-celltypes\tissues\36' 已存在，不进行任何处理


In [13]:
## Cell types: enrichment of trait-related cells per (tissue, cell type).
# The top 10% of cells by score are taken as trait-related, then each
# (tissue, cell type) group is tested for over-representation of those cells.
# core = "peri"
core = "core"
index = "lcczscore"
# index = "lcc_expresssum_zscore"
# index = "lcc_expressmean_zscore"
for trait in range(36,37):
    makedir(r"..\results\trait-cells-in-tissues-and-celltypes\celltypes\%s"%trait)
    results_ct = pd.read_csv(r"..\results\lcczscore-pip-coreperi-alltraits\%s\%s\allcells.csv"%(core,trait))

    results_ct_sorted = results_ct.sort_values(by=index,ascending=False)
    # Top 10% of all cells, ranked by the chosen score column.
    results_ct_sorted_topk = results_ct_sorted.iloc[:int(len(results_ct_sorted) * (10 / 100))][[index,"tissue","celltype","cell"]]

    topkcells = set(list(results_ct_sorted_topk["cell"]))
    results_ct_group = results_ct.groupby(["tissue","celltype"])

    # Group the top-k rows once by the same keys, so each group's related
    # cells are a dictionary lookup instead of a scan of the whole table.
    topk_by_group = dict(iter(results_ct_sorted_topk.groupby(["tissue","celltype"])))
    empty_topk = results_ct_sorted_topk.iloc[0:0]  # for groups with no top-k cell

    # Population size and number of trait-related cells are the same for
    # every group, so compute them once.
    M = len(results_ct_sorted)
    N = len(results_ct_sorted_topk)

    result_index = []
    for index0, group in results_ct_group:
        # `group` already holds exactly the rows of this (tissue, celltype).
        tissue_cell = group
        tissue_cell_topk = topk_by_group.get(index0, empty_topk)
        tissue_cellnum = len(tissue_cell)
        tissue_cellnum_topk = len(tissue_cell_topk)

        # Jaccard index between the group's cells and its trait-related cells.
        jaccard = jaccard_similarity(set(tissue_cell["cell"]),set(tissue_cell_topk["cell"]))

        # Fisher's exact test p-value.
        # NOTE(review): the second row is a synthetic "10% of this group"
        # row, not counts taken from the remaining cells - confirm this is
        # the intended contingency table.
        b = tissue_cellnum
        a = tissue_cellnum_topk
        expected = b // 10  # same value as np.floor(b/10), kept as an int
        observed_table = [[a, b-a], [expected, b - expected]]  # 2x2 contingency table
        odds_ratio, p_value = fisher_exact(observed_table, alternative='greater')

        # Of M cells in total, N are trait-related; is observing k related
        # cells among the n cells of this group significantly high?
        # sf(k-1) gives P(X >= k).
        n = tissue_cellnum
        k = tissue_cellnum_topk
        Hpvalue = hypergeom.sf(k-1,M,N,n)

        result_index.append([index0[0],index0[1],tissue_cellnum,tissue_cellnum_topk,
                       jaccard,p_value,Hpvalue])

    # Passing the names to the constructor also works when there are no groups.
    result_index_df = pd.DataFrame(
        result_index,
        columns=["tissue","celltype","cellnum","relatedCellnum","jaccard","fisher's pvalue","hypergeom pvalue"],
    )
    result_index_df.to_csv(r"..\results\trait-cells-in-tissues-and-celltypes\celltypes\%s\top10_%s_lcczscore.csv"%(trait,core))

文件夹 'E:\00_CeSOP\results\trait-cells-in-tissues-and-celltypes\celltypes\36' 已存在，不进行任何处理
