# 49个性状
## 不画每个细胞的lcczscore变化曲线，只统计性状核心、外围基因在单个细胞内的表达情况
* lcczscore:  最大连通分支上节点数量的显著性
* weighted-lcczscore: 最大连通分支上表达值的显著性
* expressmean-lcczscore: 最大连通分支上平均每个基因表达的显著性

备注：在服务器上运行

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import random
import numpy as np
import scanpy as sc
import math
import time

import warnings
# 禁用所有警告
warnings.filterwarnings("ignore")

In [None]:
# Read the tab-separated gene table of trait number ``t + 1``.
# NOTE(review): ``t`` is not bound by any earlier line of this file; it is only
# assigned by the ``for t in range(49)`` loop further down, so this cell
# presumably has to be run after that loop (or with ``t`` set by hand) — confirm.
trait = pd.read_csv(r"..\data\poly_posterior_prior_gene_new\%s_new.txt"%(t+1),sep = "\t")

In [2]:
import os
def makedir(folder_path):
    """Create ``folder_path`` (with any missing parents) unless it already exists.

    A message is printed in both cases, saying whether the folder was created
    or was already present. Nothing is returned.
    """
    if os.path.exists(folder_path):
        # Already there: report it and leave the folder untouched.
        print(f"文件夹 '{folder_path}' 已存在，不进行任何处理")
        return

    os.makedirs(folder_path)
    print(f"文件夹 '{folder_path}' 已创建")

In [3]:
'''读取背景网络'''
def openPPI(filename):
    """Load an undirected interaction network from a tab-separated text file.

    The first line of the file is treated as a header and skipped. For every
    following line an edge is added between field 0 and field 3 (fields are
    split on tabs after stripping surrounding whitespace). Lines with fewer
    than four fields (for example a blank trailing line) are ignored, and an
    empty file yields an empty graph.

    NOTE(review): the original description gives the column layout as
    ``gene1_name gene1_id gene2_name gene2_id`` and says the nodes are gene
    names, yet the second endpoint is read from field 3 — confirm the real
    column order of the input file.

    Args:
        filename: path of the network file.

    Returns:
        A ``networkx.Graph`` whose nodes are the values found in fields 0 and 3.
    """
    G = nx.Graph()
    # The context manager closes the file even if parsing raises.
    with open(filename, "r") as a:
        next(a, None)  # skip the header line; tolerate an empty file
        for i in a:
            n = i.strip().split("\t")
            if len(n) < 4:
                continue
            # add_edge creates both endpoints, so no separate add_node calls.
            G.add_edge(n[0], n[3])
    return G

# Build the background network once; the main loop further down uses it under
# the global name ``G``.
# NOTE(review): the path uses backslashes as separators, which are only treated
# as directory separators on Windows — confirm the platform this runs on.
G = openPPI(r"..\data\network(process_id).txt")

In [4]:
# 计算最大连通分支
def Lcc(G,genename):
    """Return the largest connected component of ``G`` restricted to ``genename``.

    Args:
        G: networkx graph whose nodes are gene names.
        genename: list of gene names used to induce the subgraph.

    Returns:
        A tuple ``(members, size)``: ``members`` is a list with the genes of
        the largest connected component of the induced subgraph and ``size``
        is its length. An empty ``genename`` gives ``([], 0)``. If none of the
        genes is a node of ``G`` (the induced subgraph has no components), the
        first gene is reported as a component of size 1.
    """
    # Check for empty input first so no subgraph is built for it.
    if len(genename) == 0:
        return [], 0

    g = nx.subgraph(G, genename)
    try:
        l = max(nx.connected_components(g), key=len)
    except ValueError:
        # max() received no components. Wrap the gene in a list: calling
        # list() on the name itself would split it into single characters.
        return [genename[0]], 1
    return list(l), len(l)

In [5]:
## 计算一组基因集合的表达值均值
def expressValue(genelist,annData_df,cell):
    """Sum and average the expression of a gene set in one cell.

    :param genelist: column labels (genes) to look up
    :param annData_df: expression DataFrame, indexed by cell label with one
        column per gene
    :param cell: row label of the cell
    :return: ``(total, mean)`` of the selected values, or ``(0, 0)`` when
        nothing is selected
    """
    values = annData_df.loc[cell, genelist]
    n_values = len(values)
    if n_values == 0:
        return 0, 0

    total = sum(values)
    return total, total / n_values

In [6]:
# 计算最大连通分支，并返回最大连通分支上的基因表达值之和、均值
def lccExpressedValue(G,genename,annData_df,cell):
    """Expression sum and mean over the largest connected component.

    The largest connected component of ``genename`` in ``G`` is obtained from
    ``Lcc`` and its genes are passed to ``expressValue`` for the given cell.

    :param G: background network
    :param genename: list of gene names used to induce the subgraph
    :param annData_df: expression DataFrame handed on to ``expressValue``
    :param cell: row label of the cell
    :return: ``(sum, mean)`` as returned by ``expressValue``
    """
    component_genes = Lcc(G, genename)[0]
    return expressValue(component_genes, annData_df, cell)

In [7]:
def LccExpress_mean(g,genelist,ran,annData_df,cell):
    """Build the random reference statistics for the LCC measures.

    ``ran`` times, ``len(genelist)`` nodes are drawn without replacement from
    the nodes of ``g``. For each draw the largest connected component is
    computed with ``Lcc`` and its expression sum and mean in ``cell`` are
    computed with ``expressValue``.

    Args:
        g: background network to draw random genes from.
        genelist: gene set whose size fixes the number of genes per draw.
        ran: number of random draws.
        annData_df: expression DataFrame handed on to ``expressValue``.
        cell: row label of the cell.

    Returns:
        ``(lcc_mean, lcc_std, sum_mean, sum_std, mean_mean, mean_std)``: mean
        and standard deviation (``np.mean`` / ``np.std``) over the draws of
        the LCC size, the LCC expression sum and the LCC expression mean.

    Raises:
        ValueError: from ``random.sample`` if ``genelist`` is longer than the
            number of nodes in ``g``.
    """
    # random.sample needs a real sequence: a networkx node view is rejected
    # with TypeError on Python >= 3.11, so materialise the nodes once.
    all_genes = list(g.nodes())
    sample_size = len(genelist)

    lcc_sizes = []
    express_sums = []
    express_means = []
    for _ in range(ran):
        black_nodes = random.sample(all_genes, sample_size)
        # Compute the component once and reuse it for size and expression.
        lcc_genes, largest_ran = Lcc(g, black_nodes)
        lcc_expresssum, lcc_expressmean = expressValue(lcc_genes, annData_df, cell)

        lcc_sizes.append(largest_ran)
        express_sums.append(lcc_expresssum)
        express_means.append(lcc_expressmean)

    return (
        np.mean(lcc_sizes), np.std(lcc_sizes),
        np.mean(express_sums), np.std(express_sums),
        np.mean(express_means), np.std(express_means),
    )

In [8]:
# 计算lcczscore
def lccZscore(Lcc,lcc_mean,lcc_std):
    """Return the z-score of ``Lcc`` against a reference mean and std.

    The score is ``(Lcc - lcc_mean) / lcc_std`` rounded to 6 decimals. When
    ``lcc_std`` is 0 the score is defined as 0.
    """
    if lcc_std == 0:
        return 0
    return round((Lcc - lcc_mean) / lcc_std, 6)


In [9]:
# 计算某一组基因集合的lccexpresszscore
# G：整个背景网络
# g：随机选择基因集合的背景
def gensetLccExpressZscore(G,g,genelist,ran,annData_df,cell):
    """Compute the three LCC z-scores of a gene set in one cell.

    The observed values (LCC size, expression sum and expression mean over
    the LCC) are computed on ``G``. The reference mean and standard deviation
    of each come from ``LccExpress_mean``, which draws random gene sets of the
    same size from ``g``.

    Args:
        G: full background network, used for the observed LCC.
        g: network that the random gene sets are drawn from.
        genelist: gene set to score.
        ran: number of random draws.
        annData_df: expression DataFrame handed on to ``expressValue``.
        cell: row label of the cell.

    Returns:
        ``(lcc_zscore, lcc_expresssum_zscore, lcc_expressmean_zscore)``.
    """
    # Observed values: compute the component once and reuse its gene list.
    lcc_genes, lcc = Lcc(G, genelist)
    lcc_expresssum, lcc_expressmean = expressValue(lcc_genes, annData_df, cell)

    (lcc_mean, lcc_std,
     lcc_expresssum_mean, lcc_expresssum_std,
     lcc_expressmean_mean, lcc_expressmean_std) = \
        LccExpress_mean(g, genelist, ran, annData_df, cell)

    lcc_zscore = lccZscore(lcc, lcc_mean, lcc_std)
    lcc_expresssum_zscore = lccZscore(lcc_expresssum, lcc_expresssum_mean, lcc_expresssum_std)
    # Divide by the standard deviation of the random means, not by their mean.
    lcc_expressmean_zscore = lccZscore(lcc_expressmean, lcc_expressmean_mean, lcc_expressmean_std)

    return lcc_zscore, lcc_expresssum_zscore, lcc_expressmean_zscore

### 采样策略：等pip采样



In [None]:
# Per-cell LCC z-scores of the core / periphery genes of each of the 49 traits.
# For every trait, every TS_*.h5ad file in ..\data\TS_FACS and every cell type
# in that file, one CSV with one row per cell is written below
# ..\results\lcczscore-pip-coreperi-alltraits\<core|peri>\<trait>\<tissue>\,
# and progress is appended to log.txt in the same folder.

# Nodes of the background network; G is not modified in this loop, so the set
# is built once instead of once per cell.
network_genes = set(G.nodes())
# Number of random gene sets drawn per cell for the reference distribution.
n_random_sets = 10

for t in range(49):
    # Gene table of the trait and its core / periphery gene lists.
    trait = pd.read_csv(r"..\data\poly_posterior_prior_gene_new\%s_new.txt"%(t+1),sep = "\t")
    trait_coregene = list(pd.read_csv(r"..\data\gene set\core\%s.txt"%(t+1),sep = "\t",header= None)[0])
    trait_core = trait[trait["gene name"].isin(trait_coregene)]

    trait_perigene = list(pd.read_csv(r"..\data\gene set\periphery\%s.txt"%(t+1),sep = "\t",header= None)[0])
    trait_peri = trait[trait["gene name"].isin(trait_perigene)]

    # Tissue names come from the file names TS_<tissue>.h5ad. Other entries in
    # the folder are skipped, because slicing them would give a bogus name.
    filenames = os.listdir(r"..\data\TS_FACS")
    tissuelist = [
        f[len("TS_"):-len(".h5ad")]
        for f in filenames
        if f.startswith("TS_") and f.endswith(".h5ad")
    ]
    for tissue in tissuelist:

        # Load the single-cell data of this tissue.
        filePath = r"..\data\TS_FACS\TS_%s.h5ad"%tissue
        annData = sc.read_h5ad(filePath)
        # Cell types ordered by number of cells, smallest group first
        # (sort_values() sorts ascending).
        sorted_groups = pd.DataFrame(annData.obs)["cell_ontology_class"].value_counts().sort_values()
        celltypelist = list(sorted_groups.keys())

        # Score the core and the periphery gene set separately.
        for gene in ["core","peri"]:
            print("\n",t+1,tissue,gene)
            if gene == "core":
                genelist = trait_coregene
            elif gene == "peri":
                genelist = trait_perigene
            genelist_set = set(genelist)  # fixed for all cells of this run

            # Result folder and progress log.
            makedir(r"..\results\lcczscore-pip-coreperi-alltraits\%s\%s\%s"%(gene,t+1,tissue))
            logpath = r"..\results\lcczscore-pip-coreperi-alltraits\%s\%s\%s\log.txt"%(gene,t+1,tissue)

            for num, celltype in enumerate(celltypelist, start=1):
                with open(logpath, 'a') as r:
                    r.write("\n第" + str(num) + "/" + str(len(celltypelist)) +"个细胞类型:"+ celltype+ ","+str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+"\n")

                # Expression matrix restricted to the cells of this cell type.
                subset_annData = annData[annData.obs['cell_ontology_class'] == celltype, :]
                subset_annData_df = subset_annData.to_df()
                # For every cell, the list of genes with a non-zero value.
                expressedGenes = subset_annData_df.apply(lambda row: subset_annData_df.columns[row.to_numpy().nonzero()[0]].tolist(), axis=1)
                celllist = list(expressedGenes.index)
                print(celltype,"细胞类型："+str(num)+"//"+str(len(celltypelist))+",细胞数量："+str(len(celllist)),str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))

                # One result row per cell (cell is the cell label).
                result_list = []
                for count, cell in enumerate(celllist, start=1):
                    # Reopened per cell so progress is on disk after each one.
                    with open(logpath, 'a') as r:
                        r.write("第" + str(count) + "/" + str(len(celllist)) +"个细胞:"+str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))+"\n")

                    # Network genes expressed in this cell.
                    singlecell_network_genelist = list(set(expressedGenes[cell]) & network_genes)
                    # Cell-specific subnetwork; random gene sets are drawn from it.
                    g = nx.subgraph(G,singlecell_network_genelist)
                    # Trait genes that are expressed in this cell and in the network.
                    core_scnet_genelist = list(genelist_set & set(singlecell_network_genelist))
                    l,lcc = Lcc(G,core_scnet_genelist)

                    # z-scores against n_random_sets random gene sets.
                    lcczscore,lcc_expresssum_zscore,lcc_expressmean_zscore = \
                        gensetLccExpressZscore(G,g,core_scnet_genelist,n_random_sets,subset_annData_df,cell)
                    result_list.append([cell,len(core_scnet_genelist),lcc,lcczscore,lcc_expresssum_zscore,lcc_expressmean_zscore])

                # Passing columns= to the constructor also works when the cell
                # type has no cells; assigning .columns afterwards would raise.
                df_singcell = pd.DataFrame(
                    result_list,
                    columns=["cell","genenum","lcc","lcczscore","lcc_expresssum_zscore","lcc_expressmean_zscore"],
                )
                # NOTE(review): celltype is used verbatim as the file name; a
                # name containing a path separator would presumably break the
                # write — confirm against the cell type labels.
                df_singcell.to_csv(r"..\results\lcczscore-pip-coreperi-alltraits\%s\%s\%s\%s.csv"%(gene,t+1,tissue,celltype))

