In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import random
import numpy as np
import scanpy as sc
import math
import time

import warnings
# 禁用所有警告
warnings.filterwarnings("ignore")

In [2]:
import os
def makedir(folder_path):
    """Ensure that the directory ``folder_path`` exists.

    When the path is missing it is created with ``os.makedirs`` (so missing
    parent directories are created as well). A status message is printed in
    both cases. Nothing is returned.

    :param folder_path: path of the directory to create if absent.
    """
    # Guard clause: leave an existing path untouched.
    if os.path.exists(folder_path):
        print(f"文件夹 '{folder_path}' 已存在，不进行任何处理")
        return
    os.makedirs(folder_path)
    print(f"文件夹 '{folder_path}' 已创建")

In [3]:
'''读取背景网络'''
def openPPI(filename):
    '''
    Load an undirected PPI network from a tab-separated text file.

    The first line is treated as a header and skipped. Every following
    non-blank line is split on tabs, and an edge is added between column 0
    and column 3.

    NOTE(review): the original note describes the layout as
    ``gene1_name  gene1_id  gene2_name  gene2_id`` and says the nodes are
    gene names, yet the code reads column 3 (not column 2) for the second
    endpoint. The column index is left unchanged here -- confirm against
    the actual data file.

    :param filename: path of the edge-list file.
    :return: ``networkx.Graph`` built from the edge list.
    :raises IndexError: if a non-blank line has fewer than four tab-separated fields.
    '''
    G = nx.Graph()
    # The context manager closes the file even if a malformed line raises.
    with open(filename, "r") as handle:
        # Skip the header; the default avoids StopIteration on an empty file.
        next(handle, None)
        for line in handle:
            record = line.strip()
            if not record:
                # Blank lines (e.g. a trailing newline) carry no edge.
                continue
            fields = record.split("\t")
            # add_edge also creates both endpoint nodes.
            G.add_edge(fields[0], fields[3])
    return G

# Load the background PPI network into the notebook-level name ``G``.
# NOTE(review): the path is absolute and machine-specific, and later cells
# rebind ``G`` (edge-removal loops and the toy-graph demo).
G = openPPI(r"E:\00_CeSOP\data\network(process_id).txt")

In [4]:
# 计算最大连通分支
def Lcc(G,genename):
    '''
    Largest connected component (LCC) of the subgraph of ``G`` induced by
    ``genename``.

    :param G: networkx graph; the nodes are gene names.
    :param genename: list of gene names used to induce the subgraph.
    :return: ``(members, size)`` where ``members`` is a list of the nodes in
        the LCC and ``size`` is its length. Special cases:

        * empty ``genename`` -> ``([], 0)``
        * no connected component is found (``max`` raises ``ValueError`` on
          an empty iterable, presumably because none of the genes are nodes
          of ``G``) -> ``([genename[0]], 1)``: the first gene is treated as
          a component of size 1.
    '''
    if len(genename) == 0:
        return [], 0

    g = nx.subgraph(G, genename)
    try:
        component = max(nx.connected_components(g), key=len)
    except ValueError:
        # Wrap the gene in a list. Calling list() on the bare string would
        # split the gene name into single characters.
        return [genename[0]], 1
    return list(component), len(component)

In [5]:
## 计算一组基因集合的表达值均值
def expressValue(genelist,annData_df,cell):
    '''
    Sum and mean of one cell's expression values over a set of genes.

    :param genelist: column labels (genes) to select.
    :param annData_df: expression DataFrame, accessed as
        ``annData_df.loc[cell, genelist]`` (row = cell label, columns = genes).
    :param cell: row label of the cell (a label, not a positional index).
    :return: ``(total, mean)`` of the selected values; ``(0, 0)`` when the
        selection is empty.
    '''
    selected = annData_df.loc[cell, genelist]
    n_selected = len(selected)
    if n_selected == 0:
        return 0, 0
    total = sum(selected)
    return total, total / n_selected

In [6]:
# 计算最大连通分支，并返回最大连通分支上的基因表达值之和、均值
def lccExpressedValue(G,genename,annData_df,cell):
    '''
    Expression sum and mean over the largest connected component of a gene set.

    The component is obtained from ``Lcc(G, genename)`` and its genes are
    passed to ``expressValue`` for the given cell.

    :param G: background network passed to ``Lcc``.
    :param genename: gene list passed to ``Lcc``.
    :param annData_df: expression DataFrame passed to ``expressValue``.
    :param cell: cell label passed to ``expressValue``.
    :return: ``(sum, mean)`` as returned by ``expressValue``.
    '''
    component_genes, _component_size = Lcc(G, genename)
    expr_total, expr_mean = expressValue(component_genes, annData_df, cell)
    return expr_total, expr_mean

In [7]:
def LccExpress_mean(g,genelist,ran,annData_df,cell):
    '''
    Random-sampling reference distribution for the LCC statistics.

    ``ran`` times, ``len(genelist)`` nodes are drawn without replacement from
    ``g``. For each draw, the largest connected component is computed with
    ``Lcc`` and its expression sum/mean in ``cell`` with ``expressValue``.

    :param g: background network the random gene sets are drawn from.
    :param genelist: reference gene set; only its length is used.
    :param ran: number of random draws.
    :param annData_df: expression DataFrame passed to ``expressValue``.
    :param cell: cell label passed to ``expressValue``.
    :return: ``(lcc_mean, lcc_std, sum_mean, sum_std, mean_mean, mean_std)``,
        i.e. ``np.mean`` and ``np.std`` over the draws of the LCC size, the
        LCC expression sum and the LCC expression mean.
    :raises ValueError: from ``random.sample`` if ``len(genelist)`` exceeds
        the number of nodes in ``g``.
    '''
    # random.sample needs a sequence. g.nodes() is a set-like view, which
    # random.sample rejects with TypeError on Python >= 3.11, so materialise
    # it once outside the loop.
    all_genes = list(g.nodes())
    sample_size = len(genelist)

    lcc_sizes = []       # LCC size per draw
    express_sums = []    # LCC expression sum per draw
    express_means = []   # LCC expression mean per draw
    for _ in range(ran):
        black_nodes = random.sample(all_genes, sample_size)
        # Compute the component once and reuse it for size and expression.
        lcc_genes, largest_ran = Lcc(g, black_nodes)
        lcc_expresssum, lcc_expressmean = expressValue(lcc_genes, annData_df, cell)

        lcc_sizes.append(largest_ran)
        express_sums.append(lcc_expresssum)
        express_means.append(lcc_expressmean)

    lcc_mean = np.mean(lcc_sizes)
    lcc_std = np.std(lcc_sizes)

    lcc1_mean = np.mean(express_sums)
    lcc1_std = np.std(express_sums)

    lcc2_mean = np.mean(express_means)
    lcc2_std = np.std(express_means)

    return lcc_mean,lcc_std,lcc1_mean,lcc1_std,lcc2_mean,lcc2_std

In [8]:
# 计算lcczscore
def lccZscore(Lcc,lcc_mean,lcc_std):
    '''
    Z-score of an observed value against a reference mean and std.

    :param Lcc: observed value.
    :param lcc_mean: mean of the reference distribution.
    :param lcc_std: standard deviation of the reference distribution.
    :return: ``(Lcc - lcc_mean) / lcc_std`` rounded to 6 decimals, or ``0``
        when ``lcc_std`` equals 0.
    '''
    if lcc_std != 0:
        return round((Lcc - lcc_mean) / lcc_std, 6)
    # Zero spread: the z-score is undefined, so 0 is reported.
    return 0


In [9]:
# 计算某一组基因集合的lccexpresszscore
# G：整个背景网络
# g：随机选择基因集合的背景
def gensetLccExpressZscore(G,g,genelist,ran,annData_df,cell):
    '''
    Z-scores of a gene set's LCC statistics against random gene sets.

    The observed values (LCC size, LCC expression sum, LCC expression mean)
    are computed on ``G``. The reference mean/std for each come from
    ``LccExpress_mean``, which draws random gene sets from ``g``.

    :param G: whole background network, used for the observed LCC.
    :param g: network the random gene sets are drawn from.
    :param genelist: gene set to score.
    :param ran: number of random draws.
    :param annData_df: expression DataFrame.
    :param cell: cell label.
    :return: ``(lcc_zscore, lcc_expresssum_zscore, lcc_expressmean_zscore)``.
    '''
    # Compute the observed component once and reuse it for the expression stats.
    lcc_genes, lcc = Lcc(G, genelist)
    lcc_expresssum, lcc_expressmean = expressValue(lcc_genes, annData_df, cell)

    lcc_mean,lcc_std,lcc_expresssum_mean,lcc_expresssum_std,lcc_expressmean_mean,lcc_expressmean_std = \
        LccExpress_mean(g,genelist,ran,annData_df,cell)

    lcc_zscore = lccZscore(lcc,lcc_mean,lcc_std)
    lcc_expresssum_zscore = lccZscore(lcc_expresssum,lcc_expresssum_mean,lcc_expresssum_std)
    # The third argument must be the std of the random means (the mean was
    # previously passed twice, which gave a wrong z-score).
    lcc_expressmean_zscore = lccZscore(lcc_expressmean,lcc_expressmean_mean,lcc_expressmean_std)

    return lcc_zscore,lcc_expresssum_zscore,lcc_expressmean_zscore

In [12]:
# Check of the edge-removal step: for removal_ratio = 1..4 (10% to 40% of the
# edges), print the edge count before and after the random removal.
# NOTE(review): each iteration re-reads the network file and rebinds the
# notebook-level ``G``; after this cell ``G`` is the network from the last
# iteration. No random seed is set, so the removed edges differ between runs.
for removal_ratio in range(1,5):  # removal_ratio * 0.1 is the fraction of edges to remove
    G = openPPI(r"E:\00_CeSOP\data\network(process_id).txt")
    print(len(G.edges))
    # Number of edges to remove (truncated towards zero by int()).
    num_edges_to_remove = int(len(G.edges()) * removal_ratio*0.1)
    # Randomly choose that many edges.
    edges_to_remove = random.sample(list(G.edges()), num_edges_to_remove)
    # Remove the chosen edges from the graph.
    G.remove_edges_from(edges_to_remove)
    print(len(G.edges))
    print()
    
    

314748
283274

314748
251799

314748
220324

314748
188849



In [14]:
# Robustness run: remove a fraction of the network edges at random, then, for
# every cell of every cell type in the selected tissue(s), compute the LCC
# size and the LCC z-scores of the trait's core genes and write one CSV per
# cell type.
# NOTE(review): paths are absolute and machine-specific, and no random seed is
# set, so results are not reproducible between runs.
t = 28  # zero-based trait index; the input files are named with t+1
for removal_ratio in range(1,5)[0:1]:  # removal_ratio * 0.1 = fraction of edges removed; [0:1] restricts this run to 10%
    G = openPPI(r"E:\00_CeSOP\data\network(process_id).txt")
    # Number of edges to remove (truncated towards zero by int()).
    num_edges_to_remove = int(len(G.edges()) * removal_ratio*0.1)
    # Randomly choose the edges to remove.
    edges_to_remove = random.sample(list(G.edges()), num_edges_to_remove)
    # Remove the chosen edges from the graph.
    G.remove_edges_from(edges_to_remove)
    # G is not modified below, so build its node set once instead of once
    # per cell.
    network_genes = set(G.nodes())

    # Trait gene table and the trait's core gene list.
    trait = pd.read_csv(r"E:\00_CeSOP\data\poly_posterior_prior_gene\%s_new.txt"%(t+1),sep = "\t")
    trait_coregene = list(pd.read_csv(r"E:\00_CeSOP\data\core_peri_geneset_of_49_traits\core\%s.txt"%(t+1),sep = "\t",header= None)[0])
    trait_core = trait[trait["gene name"].isin(trait_coregene)]

    tissuelist = os.listdir(r"E:\00_CeSOP\results\asthma\lcczscore-pip-cell-sixpoints")
    for tissue in tissuelist[0:1]:  # [0:1] restricts this run to the first tissue
        # A real newline; the previous "//n" was printed literally.
        print("\n%s"%tissue)
        # Read the TS FACS single-cell data for this tissue.
        filePath = r"E:\00_CeSOP\data\TS_FACS\TS_%s.h5ad"%tissue
        annData = sc.read_h5ad(filePath)
        # Count the cells per cell type; sort_values() orders the types by
        # ascending cell count, so the smallest groups are processed first.
        sorted_groups = pd.DataFrame(annData.obs)["cell_ontology_class"].value_counts().sort_values()
        celltypelist = list(sorted_groups.keys())

        # Gene set to score (core genes here).
        genelist = trait_coregene

        # Create the result folder.
        makedir(r"E:\00_CeSOP\results\robust\network percent\percent%s\%s"%(100-removal_ratio*10,tissue))
        # Iterate over cell types.
        num = 0
        for celltype in celltypelist:
            num +=1
            # Select the cells of the current cell type.
            subset_annData = annData[annData.obs['cell_ontology_class'] == celltype, :]
            subset_annData_df = subset_annData.to_df()
            # Per cell: list of genes with a non-zero expression value.
            expressedGenes = subset_annData_df.apply(lambda row: subset_annData_df.columns[row.to_numpy().nonzero()[0]].tolist(), axis=1)
            celllist = list(expressedGenes.index)
            print(celltype,"细胞类型："+str(num)+"//"+str(len(celltypelist))+",细胞数量："+str(len(celllist)),str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))

            # Compute the result for every cell.
            count = 0
            result_list = []
            for cell in celllist:   # cell is the cell's index label
                count+=1

                # Expressed genes of this cell that are nodes of the network.
                singlecell_network_genelist = list(set(expressedGenes[cell]) & network_genes)

                # Single-cell subnetwork; random gene sets are drawn from it.
                g = nx.subgraph(G,singlecell_network_genelist)
                core_scnet_genelist = list(set(genelist) & set(singlecell_network_genelist))
                l,lcc = Lcc(G,core_scnet_genelist)

                # LCC z-scores against 20 random gene sets.
                lcczscore,lcc_expresssum_zscore,lcc_expressmean_zscore = \
                    gensetLccExpressZscore(G,g,core_scnet_genelist,20,subset_annData_df,cell)
                result_list.append([cell,len(core_scnet_genelist),lcc,lcczscore,lcc_expresssum_zscore,lcc_expressmean_zscore])

            df_singcell = pd.DataFrame(result_list)
            df_singcell.columns = ["cell","genenum","lcc","lcczscore","lcc_expresssum_zscore","lcc_expressmean_zscore"]
            df_singcell.to_csv(r"E:\00_CeSOP\results\robust\network percent\percent%s\%s\%s.csv"%(100-removal_ratio*10,tissue,celltype))



//nBladder
文件夹 'E:\00_CeSOP\results\robust\network percent\percent90\Bladder' 已存在，不进行任何处理
plasmacytoid dendritic cell 细胞类型：1//15,细胞数量：18 2024-12-01 12:08:46
endothelial cell of lymphatic vessel 细胞类型：2//15,细胞数量：74 2024-12-01 12:08:47
capillary endothelial cell 细胞类型：3//15,细胞数量：77 2024-12-01 12:08:59
b cell 细胞类型：4//15,细胞数量：253 2024-12-01 12:09:12
vein endothelial cell 细胞类型：5//15,细胞数量：278 2024-12-01 12:09:55
smooth muscle cell 细胞类型：6//15,细胞数量：290 2024-12-01 12:10:42
nk cell 细胞类型：7//15,细胞数量：508 2024-12-01 12:11:33
pericyte cell 细胞类型：8//15,细胞数量：875 2024-12-01 12:13:00
mast cell 细胞类型：9//15,细胞数量：1029 2024-12-01 12:15:01
plasma cell 细胞类型：10//15,细胞数量：1141 2024-12-01 12:16:21
myofibroblast cell 细胞类型：11//15,细胞数量：2078 2024-12-01 12:18:07
t cell 细胞类型：12//15,细胞数量：2916 2024-12-01 12:22:23
bladder urothelial cell 细胞类型：13//15,细胞数量：4151 2024-12-01 12:26:36
macrophage 细胞类型：14//15,细胞数量：5338 2024-12-01 12:35:45
fibroblast 细胞类型：15//15,细胞数量：5557 2024-12-01 12:43:19


In [12]:
import networkx as nx
import random

# Toy demonstration of random edge removal on a small undirected graph.
# NOTE(review): this rebinds the notebook-level ``G``; cells that expect the
# PPI network in ``G`` must reload it after running this cell.
G = nx.erdos_renyi_graph(n=10, p=0.3)  # random graph with 10 nodes and edge probability 0.3

# Show the edges of the original graph.
print("原图的边：", list(G.edges()))

# Fraction of edges to remove.
removal_ratio = 0.2  # e.g. remove 20% of the edges

# Number of edges to remove (truncated towards zero by int()).
num_edges_to_remove = int(len(G.edges()) * removal_ratio)

# Randomly choose the edges to remove.
edges_to_remove = random.sample(list(G.edges()), num_edges_to_remove)

# Remove the chosen edges from the graph.
G.remove_edges_from(edges_to_remove)

# Show the edges that remain after the removal.
print("删除边后的图的边：", list(G.edges()))


原图的边： [(0, 9), (2, 4), (3, 7), (3, 9), (4, 6), (4, 7), (5, 6), (5, 7), (6, 7), (7, 8)]
删除边后的图的边： [(0, 9), (2, 4), (3, 7), (3, 9), (4, 6), (5, 7), (6, 7), (7, 8)]
