In [1]:
import pandas as pd
import numpy as np
import subprocess
import os

In [2]:
# Base directory of the analysis modules. Adjust it for your machine either by
# editing the default below or by setting the MTD_WORKDIR_OMIC environment
# variable before starting the kernel.
# os.path.join(..., '') keeps a trailing separator, so a path built as
# workdir_omic + "sub/dir/file" stays valid even if the override omits it.
workdir_omic = os.path.join(
    os.environ.get('MTD_WORKDIR_OMIC', '/Users/dongjiacheng/Desktop/coder/mtd/code/analysis_module/'),
    '',
)

#### 调用R脚本，对输入的基因表达矩阵，进行差异分析，生成差异分析文件

In [3]:
def run_deseq(input_path, output_path, repetition):
    """
    Run the DESeq2 R script to perform differential expression analysis.

    The script `differential_analysis/Deseq2.R` under `workdir_omic` is launched
    with `Rscript` and receives the three arguments as `--input`, `--output`
    and `--repetition`.

    Args:
        input_path: Path of the input file (gene expression matrix).
        output_path: Path of the output file the R script should produce.
        repetition: Number of sample replicates; converted to a string for the
            command line.

    Returns:
        str: The captured stdout of the R script when it exits with status 0,
        otherwise its captured stderr.

    Note:
        Only a non-zero exit status (subprocess.CalledProcessError) is handled
        here; any other exception raised by subprocess.run propagates to the
        caller.
    """

    # Location of the R script; depends on the notebook-level `workdir_omic`
    script_path = os.path.join(workdir_omic, 'differential_analysis/Deseq2.R')

    cmd = [
        'Rscript', script_path,
        '--input', input_path,
        '--output', output_path,
        '--repetition', str(repetition),
    ]

    # Run the R script and capture both output streams as text
    try:
        result = subprocess.run(cmd, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as e:
        # The attribute is `stderr`; the previous `e.stderrs` raised
        # AttributeError and hid the actual R error message.
        return e.stderr
    return result.stdout
    
# Example call; the text returned by run_deseq is kept in output_deseq
output_deseq = run_deseq(
    input_path="input_file/expression_matrix_deseq2.csv",
    output_path="output_file/deseq2.tsv",
    repetition=3,
)

#### 基于Python的Plotly包，对差异分析结果进行可视化

In [3]:
# Load the tab-separated DESeq2 result table (same relative path that is passed
# to run_deseq as its output) and preview the first rows.
df_deseq2 = pd.read_csv(filepath_or_buffer="output_file/deseq2.tsv", delimiter="\t")
df_deseq2.head()

Unnamed: 0,Gene,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj
0,MYCTH_2114025,84.477576,1.769935,0.295154,5.996652,2.014266e-09,4.217154e-09
1,MYCTH_2293935,27.971419,-0.833626,0.474323,-1.757506,0.0788316,0.09537086
2,MYCTH_2293936,13.674138,-2.335144,0.702318,-3.32491,0.0008844695,0.001288265
3,MYCTH_2051335,331.950405,-0.889314,0.131604,-6.757494,1.403993e-11,3.266067e-11
4,MYCTH_2293939,805.966778,1.843547,0.107909,17.084273,1.943496e-65,1.8050059999999998e-64


In [None]:
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go

def plot_volcano(df_deseq2, width=1200, height=900 , p_threshold=0.05, logFC_threshold=1, color_schemes = 1, symbol_num=3, bubble_size=5, opacity=0.8, up_donw_info='yes', x_ca='yes'):
    """Draw a volcano plot from a differential-analysis result table.

    Genes are split into three marker traces ('Up', 'Down', 'Nonsignificant').
    A gene is significant when |logFC| > logFC_threshold and its adjusted
    p-value is < p_threshold. The y value is -log10(adjusted p), clipped to
    the range [0, 100]; adjusted p-values equal to 0 are replaced by 1e-300
    before the logarithm.

    Args:
        df_deseq2: DataFrame with at least the columns 'Gene',
            'log2FoldChange' and 'padj'. Rows with a missing value in any of
            these columns are dropped. The input frame is not modified.
        width: Figure width passed to the layout.
        height: Figure height passed to the layout.
        p_threshold: Adjusted p-value cut-off.
        logFC_threshold: Absolute log2 fold-change cut-off.
        color_schemes: Colour scheme id (1-4) selecting the up/down colours.
            Any other value falls back to scheme 1.
        symbol_num: Number of genes to label per direction: the significant
            up-regulated genes with the largest logFC and the significant
            down-regulated genes with the smallest logFC.
        bubble_size: Marker size.
        opacity: Marker opacity.
        up_donw_info: 'yes' adds the counts of up- and down-regulated genes as
            annotations in the top-left corner; any other value omits them.
        x_ca: 'yes' clips logFC to the range [-6, 6]; any other value leaves
            logFC unchanged.

    Returns:
        The plotly figure object.

    Side effects:
        Writes the figure with fig.write_image (scale=4) to
        'differential_analysis/output_file/volcano.png' under `workdir_omic`.
    """

    # Pre-processing: keep only the three needed columns, under short names
    data = df_deseq2[['Gene', 'log2FoldChange', 'padj']].copy()
    data.columns = ['Symbol', 'logFC', 'P.adjust']
    data = data.dropna()

    if x_ca == 'yes':
        # Squash extreme fold changes onto the plot border
        data['logFC'] = np.clip(data['logFC'], -6, 6)

    gene_names = data['Symbol'].values
    logFC = data['logFC'].values

    # Convert adjusted p to -log10(p). Zeros become 1e-300 so the logarithm is
    # finite, and the result is capped at 100 to fit the fixed y axis.
    data['P.adjust'] = data['P.adjust'].replace(0, 1e-300)
    neg_log_p = np.clip(-np.log10(data['P.adjust'].values), 0, 100)
    # Stored on the frame so labelled rows reuse exactly the plotted y value
    data['neg_log_p'] = neg_log_p

    # Classify genes by the two thresholds
    significant = (np.abs(logFC) > logFC_threshold) & (data['P.adjust'].values < p_threshold)
    upregulated = significant & (logFC > 0)
    downregulated = significant & (logFC < 0)
    nonsignificant = ~significant

    # Genes that receive a text label
    top_up = data[upregulated].nlargest(symbol_num, 'logFC')
    top_down = data[downregulated].nsmallest(symbol_num, 'logFC')

    # Colour schemes: id -> (up colour, down colour). An unknown id falls back
    # to scheme 1 instead of leaving the colours undefined.
    palettes = {
        1: ('#f08d1a', '#7fa4ca'),
        2: ('#f26c6a', '#54a857'),
        3: ('#c42121', '#15609b'),
        4: ('#df7415', '#3d9241'),
    }
    up_color, down_color = palettes.get(color_schemes, palettes[1])

    fig = go.Figure()

    # One marker trace per class; gene names are attached as point text
    for mask, color, name in (
        (upregulated, up_color, 'Up'),
        (downregulated, down_color, 'Down'),
        (nonsignificant, '#A9A9A9', 'Nonsignificant'),
    ):
        fig.add_trace(go.Scatter(
            x=logFC[mask], y=neg_log_p[mask], mode='markers',
            marker=dict(color=color, size=bubble_size, sizemode='area', symbol='circle', opacity=opacity,
                        line=dict(color='black', width=0.4)),
            name=name,
            text=gene_names[mask]))

    # Text labels for the selected genes. The clipped y value is used so each
    # label sits on its marker (previously the unclipped value was used, which
    # moved labels of genes with padj < 1e-100 away from their points).
    for labelled, position in ((top_up, "top center"), (top_down, "bottom center")):
        for _, row in labelled.iterrows():
            fig.add_trace(go.Scatter(x=[row['logFC']], y=[row['neg_log_p']],
                                     text=[row['Symbol']], mode='text',
                                     textposition=position,
                                     textfont=dict(size=11),
                                     showlegend=False))

    if up_donw_info == 'yes':
        # Counts of up-/down-regulated genes, stacked in the top-left corner
        # (paper coordinates x=0, anchored left/top)
        for y_pos, text in (
            (1, "Up: {} genes".format(np.sum(upregulated))),
            (0.95, "Down: {} genes".format(np.sum(downregulated))),
        ):
            fig.add_annotation(xref="paper", yref="paper",
                               x=0, y=y_pos, showarrow=False,
                               xanchor='left', yanchor='top',
                               text=text,
                               font=dict(size=16),
                               align="left",
                               bgcolor="white",
                               borderpad=4)

    # Threshold lines. An empty table would make np.min/np.max fail, so fall
    # back to a span around the fold-change threshold in that case.
    if logFC.size:
        x_min, x_max = np.min(logFC), np.max(logFC)
        y_top = float(np.max(neg_log_p)) + 10
    else:
        x_min, x_max = -abs(logFC_threshold), abs(logFC_threshold)
        y_top = 10
    p_line = -np.log10(p_threshold)
    dashed = dict(color="Black", width=1, dash="dash")
    fig.update_layout(shapes=[
        dict(type="line", x0=x_min, x1=x_max, y0=p_line, y1=p_line, line=dashed),
        dict(type="line", x0=logFC_threshold, x1=logFC_threshold, y0=0, y1=y_top, line=dashed),
        dict(type="line", x0=-logFC_threshold, x1=-logFC_threshold, y0=0, y1=y_top, line=dashed),
    ])

    # Layout; the y axis is fixed slightly above the clip value of 100 so the
    # top-most points are fully visible
    fig.update_layout(
        xaxis_title='log2 Fold Change',
        yaxis_title='-log10(p-value)',
        yaxis=dict(range=[0, 105]),
        title='DE Analysis Volcano Plot',
        template="plotly_white",
        height=height,
        width=width,
    )

    # Save as PNG with scale=4. os.path.join gives the same path as plain
    # concatenation when workdir_omic ends with a separator and also works
    # when it does not.
    fig.write_image(os.path.join(workdir_omic, "differential_analysis/output_file/volcano.png"), scale=4)

    # Alternative return formats that were sketched here (not active):
    #   1. JSON string:   return pio.to_json(fig)
    #   2. HTML <div>:    return plot(fig, output_type='div', include_plotlyjs=False)
    #                     (`plot` is not imported in this notebook)
    #   3. HTML file:     fig.write_html("./output-file/kegg.html"); return "kegg.html"
    return fig

# Example call: colour scheme 1 and no gene labels (symbol_num=0). The empty
# strings for up_donw_info and x_ca differ from 'yes', so the gene-count
# annotations and the logFC clipping are both switched off.
plot_volcano(
    df_deseq2,
    width=1000, height=800,
    p_threshold=0.05, logFC_threshold=1,
    color_schemes=1,
    symbol_num=0, bubble_size=10, opacity=0.8,
    up_donw_info='', x_ca='',
)

In [3]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio

def plot_volcano(df_deseq2, width=1200, height=900, p_threshold=0.05, logFC_threshold=1, color_schemes=1,
                 symbol_num=3, bubble_size=5, opacity=0.8, up_donw_info='yes', x_ca='yes', y_ca='no'):
    """Draw a volcano plot from a differential-analysis result table.

    NOTE: this definition has the same name as the plot_volcano defined in an
    earlier cell of this notebook and replaces it once this cell has run.

    A gene is significant when |logFC| > logFC_threshold and its adjusted
    p-value is < p_threshold. Significant genes are drawn as 'Upregulated'
    (logFC > 0) or 'Downregulated' (logFC < 0); all remaining genes are drawn
    in grey as 'Nonsignificant'. Adjusted p-values equal to 0 are replaced by
    1e-300 before taking -log10.

    Args:
        df_deseq2 (DataFrame): Result table with at least the columns 'Gene',
            'log2FoldChange' and 'padj'. Rows with a missing value in any of
            these columns are dropped. The input frame is not modified.
        width (int): Figure width.
        height (int): Figure height.
        p_threshold (float): Adjusted p-value cut-off.
        logFC_threshold (float): Absolute log2 fold-change cut-off.
        color_schemes (int): Colour scheme id (1-4); any other value falls
            back to scheme 1.
        symbol_num (int): Number of significant genes to label, chosen by
            largest |logFC| (ties broken by smallest adjusted p-value).
            Values <= 0 disable labels.
        bubble_size (int): Marker size.
        opacity (float): Marker opacity.
        up_donw_info (str): 'yes' adds the counts of up- and down-regulated
            genes as annotations in the top-left corner.
        x_ca (str): 'yes' clips logFC to the range [-6, 6].
        y_ca (str): 'yes' fixes the y axis to [0, 105] and caps the plotted
            -log10 values at 100 so no point falls outside that range.

    Returns:
        Plotly Graph Object: the volcano plot figure.

    Side effects:
        Writes the symbols of the up- and down-regulated genes, one per line,
        to 'upregulated_genes.txt' and 'downregulated_genes.txt' in the
        current working directory.
    """
    # Pre-processing: keep only the three needed columns, under short names
    data = df_deseq2[['Gene', 'log2FoldChange', 'padj']].copy()
    data.columns = ['Symbol', 'logFC', 'P.adjust']
    data = data.dropna()

    if x_ca == 'yes':
        data['logFC'] = np.clip(data['logFC'], -6, 6)

    # Convert adjusted p to -log10(p); zeros become 1e-300 to keep it finite
    data['P.adjust'] = data['P.adjust'].replace(0, 1e-300)
    data['neg_log_p'] = -np.log10(data['P.adjust'])
    if y_ca == 'yes':
        # Keep very small p-values inside the fixed [0, 105] axis
        data['neg_log_p'] = data['neg_log_p'].clip(upper=100)

    # Classify genes by the two thresholds
    significant = (data['logFC'].abs() > logFC_threshold) & (data['P.adjust'] < p_threshold)
    upregulated = significant & (data['logFC'] > 0)
    downregulated = significant & (data['logFC'] < 0)
    nonsignificant = ~significant

    # Save the up- and down-regulated gene lists
    data.loc[upregulated, 'Symbol'].to_csv('upregulated_genes.txt', index=False, header=False)
    data.loc[downregulated, 'Symbol'].to_csv('downregulated_genes.txt', index=False, header=False)

    # Genes that receive a text label. Only significant genes are considered,
    # so a label never points at a grey (non-significant) marker.
    if symbol_num > 0:
        top_genes = (
            data[significant]
            .assign(abs_logFC=lambda frame: frame['logFC'].abs())
            .sort_values(by=['abs_logFC', 'P.adjust'], ascending=[False, True])
            .head(symbol_num)
        )
    else:
        top_genes = data.iloc[0:0]

    # Colour schemes: id -> (up colour, down colour)
    colors = {
        1: ('#f08d1a', '#7fa4ca'),
        2: ('#f26c6a', '#54a857'),
        3: ('#c42121', '#15609b'),
        4: ('#df7415', '#3d9241')
    }
    up_color, down_color = colors.get(color_schemes, ('#f08d1a', '#7fa4ca'))

    # Draw one marker trace per class. The previous version grouped only by
    # `significant`, which drew every significant gene as 'Upregulated' and
    # every non-significant gene as 'Downregulated'.
    fig = go.Figure()
    for mask, color, name in (
        (upregulated, up_color, 'Upregulated'),
        (downregulated, down_color, 'Downregulated'),
        (nonsignificant, '#A9A9A9', 'Nonsignificant'),
    ):
        fig.add_trace(go.Scatter(x=data.loc[mask, 'logFC'], y=data.loc[mask, 'neg_log_p'], mode='markers',
                                 marker=dict(color=color, size=bubble_size, opacity=opacity),
                                 name=name,
                                 text=data.loc[mask, 'Symbol']))

    # Text labels; hidden from the legend so they do not add extra entries
    for _, row in top_genes.iterrows():
        fig.add_trace(go.Scatter(x=[row['logFC']], y=[row['neg_log_p']],
                                 text=[row['Symbol']], mode='text',
                                 textposition="top center",
                                 showlegend=False))

    if up_donw_info == 'yes':
        # Counts of up-/down-regulated genes, stacked in the top-left corner
        for y_pos, text in (
            (1, "Up: {} genes".format(int(upregulated.sum()))),
            (0.95, "Down: {} genes".format(int(downregulated.sum()))),
        ):
            fig.add_annotation(xref="paper", yref="paper",
                               x=0, y=y_pos, showarrow=False,
                               xanchor='left', yanchor='top',
                               text=text,
                               font=dict(size=16),
                               align="left",
                               bgcolor="white",
                               borderpad=4)

    # Layout; None leaves the y axis range automatic
    yaxis_range = [0, 105] if y_ca == 'yes' else None
    fig.update_layout(xaxis_title='log2 Fold Change',
                      yaxis_title='-log10(p-value)',
                      yaxis_range=yaxis_range,
                      title='DE Analysis Volcano Plot',
                      height=height,
                      width=width)

    return fig

# Example call
plot_volcano(
    df_deseq2,
    width=1000, height=800,
    p_threshold=0.05, logFC_threshold=1,
    color_schemes=1,
    symbol_num=3, opacity=0.8,
    up_donw_info='yes', x_ca='yes', y_ca='no',
)