In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns
import polars as pl
import ipywidgets as widgets
from ipywidgets import interactive

In [2]:
# Location of the tab-separated GO enrichment result table that is read below.
# NOTE(review): absolute, workspace-specific path — adjust it when running elsewhere.
enrichment_result_file_path = "/workspaces/004_foldseek/out/rice_up_custom_enrichment/rice_go_enrichment_up.tsv"

### (1) Data pre-processing

In [3]:
# Load the enrichment result table (original note: produced with GOA tools),
# give the columns descriptive names and keep only the rows flagged "e".
column_renames = {
    "# GO": "GO",
    "NS": "GO domain",
    "enrichment" : "enrichment type",
    "name" : "GO term name",
}

enrichment_result = (
    pl.read_csv(enrichment_result_file_path, separator="\t")
    .rename(column_renames)
    .filter(pl.col("enrichment type") == "e")
)

display(enrichment_result)

GO,GO domain,enrichment type,GO term name,ratio_in_study,ratio_in_pop,p_uncorrected,depth,study_count,p_fdr_bh,study_items
str,str,str,str,str,str,f64,i64,i64,f64,str
"""GO:0006457""","""BP""","""e""","""protein folding""","""45/367""","""223/38993""",3.1677e-46,2,45,1.4068e-42,"""Os01g0135800, Os01g0136000, Os…"
"""GO:0009266""","""BP""","""e""","""response to temperature stimul…","""32/367""","""208/38993""",3.8320e-29,3,32,8.5090e-26,"""Os01g0135800, Os01g0136000, Os…"
"""GO:0009408""","""BP""","""e""","""response to heat""","""25/367""","""109/38993""",1.3223e-27,4,25,1.9574e-24,"""Os01g0135800, Os01g0136000, Os…"
"""GO:0042542""","""BP""","""e""","""response to hydrogen peroxide""","""13/367""","""29/38993""",2.1784e-19,5,13,2.4185e-16,"""Os01g0135800, Os01g0136000, Os…"
"""GO:0051259""","""BP""","""e""","""protein complex oligomerizatio…","""13/367""","""30/38993""",3.8118e-19,6,13,3.3856e-16,"""Os01g0135800, Os01g0136000, Os…"
…,…,…,…,…,…,…,…,…,…,…
"""GO:0061134""","""MF""","""e""","""peptidase regulator activity""","""6/367""","""67/38993""",0.000041,3,6,0.005609,"""Os01g0124000, Os01g0124100, Os…"
"""GO:0098772""","""MF""","""e""","""molecular function regulator a…","""16/367""","""525/38993""",0.000046,1,16,0.005955,"""Os01g0124000, Os01g0124100, Os…"
"""GO:0016462""","""MF""","""e""","""pyrophosphatase activity""","""19/367""","""718/38993""",0.000061,5,19,0.0076,"""Os01g0840100, Os02g0115900, Os…"
"""GO:0016818""","""MF""","""e""","""hydrolase activity, acting on …","""19/367""","""725/38993""",0.00007,4,19,0.00824,"""Os01g0840100, Os02g0115900, Os…"


### (2) Fold enrichment

In [4]:
def calculate_ratio(value: str) -> float:
    """Turn a ``"count/total"`` string into the float ``count / total``.

    Args:
        value: Text made of two integers separated by a single ``/``,
            e.g. ``"45/367"``.

    Returns:
        The numerator divided by the denominator.

    Raises:
        ValueError: If the text does not split into exactly two integer parts.
        ZeroDivisionError: If the denominator is zero.
    """
    count_text, total_text = value.split('/')
    count = int(count_text)
    total = int(total_text)
    return count / total

# Map each "a/b" text column to the name of the float column derived from it.
ratio_columns = {
    "ratio_in_study": "ratio in study",
    "ratio_in_pop": "ratio in population",
}

enrichment_result = enrichment_result.with_columns(
    [
        pl.col(text_column)
        .map_elements(calculate_ratio, return_dtype=pl.Float64)
        .alias(float_column)
        for text_column, float_column in ratio_columns.items()
    ]
)

display(enrichment_result)

GO,GO domain,enrichment type,GO term name,ratio_in_study,ratio_in_pop,p_uncorrected,depth,study_count,p_fdr_bh,study_items,ratio in study,ratio in population
str,str,str,str,str,str,f64,i64,i64,f64,str,f64,f64
"""GO:0006457""","""BP""","""e""","""protein folding""","""45/367""","""223/38993""",3.1677e-46,2,45,1.4068e-42,"""Os01g0135800, Os01g0136000, Os…",0.122616,0.005719
"""GO:0009266""","""BP""","""e""","""response to temperature stimul…","""32/367""","""208/38993""",3.8320e-29,3,32,8.5090e-26,"""Os01g0135800, Os01g0136000, Os…",0.087193,0.005334
"""GO:0009408""","""BP""","""e""","""response to heat""","""25/367""","""109/38993""",1.3223e-27,4,25,1.9574e-24,"""Os01g0135800, Os01g0136000, Os…",0.06812,0.002795
"""GO:0042542""","""BP""","""e""","""response to hydrogen peroxide""","""13/367""","""29/38993""",2.1784e-19,5,13,2.4185e-16,"""Os01g0135800, Os01g0136000, Os…",0.035422,0.000744
"""GO:0051259""","""BP""","""e""","""protein complex oligomerizatio…","""13/367""","""30/38993""",3.8118e-19,6,13,3.3856e-16,"""Os01g0135800, Os01g0136000, Os…",0.035422,0.000769
…,…,…,…,…,…,…,…,…,…,…,…,…
"""GO:0061134""","""MF""","""e""","""peptidase regulator activity""","""6/367""","""67/38993""",0.000041,3,6,0.005609,"""Os01g0124000, Os01g0124100, Os…",0.016349,0.001718
"""GO:0098772""","""MF""","""e""","""molecular function regulator a…","""16/367""","""525/38993""",0.000046,1,16,0.005955,"""Os01g0124000, Os01g0124100, Os…",0.043597,0.013464
"""GO:0016462""","""MF""","""e""","""pyrophosphatase activity""","""19/367""","""718/38993""",0.000061,5,19,0.0076,"""Os01g0840100, Os02g0115900, Os…",0.051771,0.018414
"""GO:0016818""","""MF""","""e""","""hydrolase activity, acting on …","""19/367""","""725/38993""",0.00007,4,19,0.00824,"""Os01g0840100, Os02g0115900, Os…",0.051771,0.018593


In [5]:
# fold enrichment
def fold_enrichment(df):
    """Append a "fold enrichment" column and sort by it, largest first.

    The new column is "ratio in study" divided by "ratio in population";
    both columns must already be present in ``df``.

    Args:
        df: Polars DataFrame holding the two ratio columns.

    Returns:
        A new DataFrame with the extra column, sorted in descending order
        of "fold enrichment".
    """
    ratio_expr = pl.col("ratio in study") / pl.col("ratio in population")
    with_fold = df.with_columns([ratio_expr.alias("fold enrichment")])
    return with_fold.sort("fold enrichment", descending=True)


df_fold_enrichment = fold_enrichment(enrichment_result)
display(df_fold_enrichment)

GO,GO domain,enrichment type,GO term name,ratio_in_study,ratio_in_pop,p_uncorrected,depth,study_count,p_fdr_bh,study_items,ratio in study,ratio in population,fold enrichment
str,str,str,str,str,str,f64,i64,i64,f64,str,f64,f64,f64
"""GO:0045471""","""BP""","""e""","""response to ethanol""","""3/367""","""3/38993""",8.2702e-7,5,3,0.000127,"""Os03g0266900, Os03g0267000, Os…",0.008174,0.000077,106.247956
"""GO:0046688""","""BP""","""e""","""response to copper ion""","""3/367""","""4/38993""",0.000003,4,3,0.000442,"""Os03g0266900, Os03g0267000, Os…",0.008174,0.000103,79.685967
"""GO:0046685""","""BP""","""e""","""response to arsenic-containing…","""3/367""","""5/38993""",0.000008,3,3,0.000896,"""Os03g0266900, Os03g0267000, Os…",0.008174,0.000128,63.748774
"""GO:0006880""","""BP""","""e""","""intracellular sequestering of …","""3/367""","""6/38993""",0.000016,5,3,0.001564,"""Os09g0396900, Os11g0106700, Os…",0.008174,0.000154,53.123978
"""GO:0042542""","""BP""","""e""","""response to hydrogen peroxide""","""13/367""","""29/38993""",2.1784e-19,5,13,2.4185e-16,"""Os01g0135800, Os01g0136000, Os…",0.035422,0.000744,47.628394
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""GO:0016818""","""MF""","""e""","""hydrolase activity, acting on …","""19/367""","""725/38993""",0.00007,4,19,0.00824,"""Os01g0840100, Os02g0115900, Os…",0.051771,0.018593,2.784429
"""GO:0016817""","""MF""","""e""","""hydrolase activity, acting on …","""19/367""","""733/38993""",0.000081,3,19,0.009095,"""Os01g0840100, Os02g0115900, Os…",0.051771,0.018798,2.75404
"""GO:0050896""","""BP""","""e""","""response to stimulus""","""66/367""","""2839/38993""",8.8820e-12,1,66,2.1914e-9,"""Os01g0135800, Os01g0136000, Os…",0.179837,0.072808,2.470012
"""GO:0005737""","""CC""","""e""","""cytoplasm""","""52/367""","""2548/38993""",1.7773e-7,2,52,0.00005,"""Os01g0105800, Os01g0135800, Os…",0.141689,0.065345,2.168326


### (3) Convert p-values to -log10 scale

In [7]:
# Add -log10 transformed copies of the uncorrected and FDR-corrected p-values.
if "p_uncorrected" in df_fold_enrichment.columns:
    neg_log_p = -np.log10(df_fold_enrichment["p_uncorrected"])
    df_fold_enrichment = df_fold_enrichment.with_columns(
        neg_log_p.alias("-log10(p-value_uncorrected)")
    )

if "p_fdr_bh" in df_fold_enrichment.columns:
    neg_log_fdr = -np.log10(df_fold_enrichment["p_fdr_bh"])
    # Most significant terms (largest -log10 FDR) come first after the sort.
    df_fold_enrichment = (
        df_fold_enrichment
        .with_columns(neg_log_fdr.alias("-log10(p-value_fdr_bh)"))
        .sort("-log10(p-value_fdr_bh)", descending=True)
    )

# Column order of the final table; the select below requires every one of
# these columns to exist.
final_column_order = [
    "GO",
    "GO term name",
    "GO domain",
    "depth",
    "enrichment type",
    "study_count",
    "ratio_in_study",
    "ratio_in_pop",
    "ratio in study",
    "ratio in population",
    "fold enrichment",
    "p_uncorrected",
    "p_fdr_bh",
    "-log10(p-value_uncorrected)",
    "-log10(p-value_fdr_bh)",
    "study_items",
]
df_fold_enrichment = df_fold_enrichment.select(final_column_order)

display(df_fold_enrichment)

GO,GO term name,GO domain,depth,enrichment type,study_count,ratio_in_study,ratio_in_pop,ratio in study,ratio in population,fold enrichment,p_uncorrected,p_fdr_bh,-log10(p-value_uncorrected),-log10(p-value_fdr_bh),study_items
str,str,str,i64,str,i64,str,str,f64,f64,f64,f64,f64,f64,f64,str
"""GO:0006457""","""protein folding""","""BP""",2,"""e""",45,"""45/367""","""223/38993""",0.122616,0.005719,21.440171,3.1677e-46,1.4068e-42,45.499254,41.851773,"""Os01g0135800, Os01g0136000, Os…"
"""GO:0051082""","""unfolded protein binding""","""MF""",3,"""e""",30,"""30/367""","""127/38993""",0.081744,0.003257,25.097942,2.5344e-33,6.5793e-30,32.596126,29.181822,"""Os01g0135800, Os01g0136000, Os…"
"""GO:0009266""","""response to temperature stimul…","""BP""",3,"""e""",32,"""32/367""","""208/38993""",0.087193,0.005334,16.345839,3.8320e-29,8.5090e-26,28.416573,25.070122,"""Os01g0135800, Os01g0136000, Os…"
"""GO:0009408""","""response to heat""","""BP""",4,"""e""",25,"""25/367""","""109/38993""",0.06812,0.002795,24.368797,1.3223e-27,1.9574e-24,26.878677,23.708317,"""Os01g0135800, Os01g0136000, Os…"
"""GO:0042542""","""response to hydrogen peroxide""","""BP""",5,"""e""",13,"""13/367""","""29/38993""",0.035422,0.000744,47.628394,2.1784e-19,2.4185e-16,18.661866,15.616445,"""Os01g0135800, Os01g0136000, Os…"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""GO:0098772""","""molecular function regulator a…","""MF""",1,"""e""",16,"""16/367""","""525/38993""",0.043597,0.013464,3.238033,0.000046,0.005955,4.338429,2.225154,"""Os01g0124000, Os01g0124100, Os…"
"""GO:0016462""","""pyrophosphatase activity""","""MF""",5,"""e""",19,"""19/367""","""718/38993""",0.051771,0.018414,2.811575,0.000061,0.0076,4.211247,2.119162,"""Os01g0840100, Os02g0115900, Os…"
"""GO:0016818""","""hydrolase activity, acting on …","""MF""",4,"""e""",19,"""19/367""","""725/38993""",0.051771,0.018593,2.784429,0.00007,0.00824,4.155975,2.084093,"""Os01g0840100, Os02g0115900, Os…"
"""GO:0016817""","""hydrolase activity, acting on …","""MF""",3,"""e""",19,"""19/367""","""733/38993""",0.051771,0.018798,2.75404,0.000081,0.009095,4.093767,2.04119,"""Os01g0840100, Os02g0115900, Os…"


In [8]:
# Split the full table by GO domain.
cc = df_fold_enrichment.filter(pl.col("GO domain") == "CC")
bp = df_fold_enrichment.filter(pl.col("GO domain") == "BP")
mf = df_fold_enrichment.filter(pl.col("GO domain") == "MF")

# Write the complete table first, then one tab-separated file per domain.
tables_to_write = (
    (df_fold_enrichment, "/workspaces/004_foldseek/out/rice_up_custom_enrichment/rice_go_enrichment_up_all.tsv"),
    (cc, "/workspaces/004_foldseek/out/rice_up_custom_enrichment/CC_enrichment.tsv"),
    (bp, "/workspaces/004_foldseek/out/rice_up_custom_enrichment/BP_enrichment.tsv"),
    (mf, "/workspaces/004_foldseek/out/rice_up_custom_enrichment/MF_enrichment.tsv"),
)
for table, out_path in tables_to_write:
    table.write_csv(out_path, separator="\t")

In [9]:
# "GO term" label = "<GO id>: <GO term name>"; the same expression is reused
# for the full table and for each per-domain table.
go_term_label = (pl.col("GO") + ": " + pl.col("GO term name")).alias("GO term")

# Full table as a pandas DataFrame.
df_to_pd = df_fold_enrichment.with_columns(go_term_label).to_pandas()

# One pandas DataFrame per GO domain.
cc_pd = cc.with_columns(go_term_label).to_pandas()
bp_pd = bp.with_columns(go_term_label).to_pandas()
mf_pd = mf.with_columns(go_term_label).to_pandas()

display(cc_pd.head())

Unnamed: 0,GO,GO term name,GO domain,depth,enrichment type,study_count,ratio_in_study,ratio_in_pop,ratio in study,ratio in population,fold enrichment,p_uncorrected,p_fdr_bh,-log10(p-value_uncorrected),-log10(p-value_fdr_bh),study_items,GO term
0,GO:0005788,endoplasmic reticulum lumen,CC,5,e,6,6/367,24/38993,0.016349,0.000615,26.561989,7.784347e-08,3.3e-05,7.108778,4.480389,"Os02g0115900, Os03g0832200, Os05g0156500, Os05...",GO:0005788: endoplasmic reticulum lumen
1,GO:0005783,endoplasmic reticulum,CC,5,e,22,22/367,587/38993,0.059946,0.015054,3.982036,5.505487e-08,3.3e-05,7.259204,4.480389,"Os01g0606900, Os02g0115900, Os03g0271400, Os03...",GO:0005783: endoplasmic reticulum
2,GO:0005737,cytoplasm,CC,2,e,52,52/367,2548/38993,0.141689,0.065345,2.168326,1.777328e-07,5e-05,6.750232,4.297935,"Os01g0105800, Os01g0135800, Os01g0136000, Os01...",GO:0005737: cytoplasm
3,GO:0034663,endoplasmic reticulum chaperone complex,CC,3,e,3,3/367,8/38993,0.008174,0.000205,39.842984,4.471545e-05,0.009502,4.349542,2.022183,"Os02g0115900, Os02g0710900, Os05g0428600",GO:0034663: endoplasmic reticulum chaperone co...


In [10]:
# enrichment result plot function
current_fig = None  # most recently drawn figure; read later by the save callback

def plot_goa_dotplot_with_lines(data, col1, goa_col, col2, col3, ylabel_suffix='', width=11, height=14, palette="flare"):
    """Draw a dot plot of GO terms with a horizontal line from x=0 to each dot.

    The figure that is created is also stored in the module-level
    ``current_fig`` so that it can be saved after it has been shown.

    Args:
        data: pandas DataFrame with one row per GO term.
        col1: Column plotted on the x-axis (also used as the x-axis label).
        goa_col: Column holding the term labels plotted on the y-axis.
        col2: Column mapped to the dot size.
        col3: Column mapped to the dot colour and to the line colour.
        ylabel_suffix: Optional text appended to the "GO Terms" y-axis label.
        width: Figure width passed to ``plt.subplots``.
        height: Figure height passed to ``plt.subplots``.
        palette: Name of the seaborn palette used for dots and lines.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # 1. figure size
    global current_fig
    current_fig, ax = plt.subplots(figsize=(width, height))

    # 2. color normalization (lines are coloured on the min..max range of col3)
    norm = mcolors.Normalize(vmin=data[col3].min(), vmax=data[col3].max())

    # 3. color palette as a colormap, so a normalised value can be looked up
    color_palette = sns.color_palette(palette, as_cmap=True)

    # 4. plotting
    sns.scatterplot(
        data=data,
        x=col1,
        y=goa_col,
        size=col2,
        hue=col3,
        palette=palette,
        legend='brief',
        ax=ax
    )

    # 5. lines: one horizontal segment per term, from x=0 to the dot
    for x_value, term, hue_value in zip(data[col1], data[goa_col], data[col3]):
        ax.plot([0, x_value], [term, term], color=color_palette(norm(hue_value)), lw=1)

    ax.grid(color='b', linestyle=':', linewidth=0.1)
    ax.set_xlabel(col1)
    # A space separates the base label from the suffix; the original code
    # concatenated them directly ("GO Terms(Cellular Component)").
    ylabel_text = f'GO Terms {ylabel_suffix}' if ylabel_suffix else 'GO Terms'
    ax.set_ylabel(ylabel_text)

    # 6. legend: replace raw column names with readable titles.
    # The renaming must be applied to the label strings; the handles are
    # artist objects and are passed through untouched.
    legend_renames = {
        "-log10(p-value_fdr_bh)": "-log10(FDR)",
        "study_count": "Genes",
    }
    handles, labels = ax.get_legend_handles_labels()
    labels = [legend_renames.get(label, label) for label in labels]
    ax.legend(handles, labels, prop={'size': 14})

    plt.show()

In [None]:
# interactive setting
filename_text = widgets.Text(value='', description='File Name:')

def save_plot(button):
    """Button callback: write the most recently drawn figure to a PNG file.

    Nothing happens when no figure has been drawn yet. The file name comes
    from the ``filename_text`` widget; an empty text box falls back to
    ``enrichment_result.png``.

    Args:
        button: The widget that triggered the callback (not used).
    """
    if not current_fig:
        return

    filename = filename_text.value
    target = (
        f'/workspaces/004_foldseek/out/rice_up_custom_enrichment/{filename}.png'
        if filename
        else f'/workspaces/004_foldseek/out/rice_up_custom_enrichment/enrichment_result.png'
    )
    current_fig.savefig(target, dpi=800, bbox_inches='tight')


save_button = widgets.Button(description="Save Plot")
save_button.on_click(save_plot)

def interactive_plot(data):
    """Show widget controls that redraw the GO dot plot for ``data``.

    Dropdowns select the x-axis, size and hue columns, the y-label suffix
    and the colour palette; sliders set the figure width and height. The
    file-name text box and the save button are displayed below the controls.

    Args:
        data: pandas DataFrame handed unchanged to
            ``plot_goa_dotplot_with_lines``; its columns fill the size dropdown.
    """
    # Metrics offered for both the x-axis and the hue, as (label, column) pairs.
    metric_choices = [
        ('fold enrichment', 'fold enrichment'),
        ('-log10(p-value)', '-log10(p-value_uncorrected)'),
        ('-log10(FDR)', '-log10(p-value_fdr_bh)'),
    ]

    # 'study_count' is listed first under the label 'Genes', followed by
    # every other column under its own name.
    size_choices = [('Genes', 'study_count')]
    size_choices.extend((col, col) for col in data.columns if col != 'study_count')

    x_axis_picker = widgets.Dropdown(
        options=list(metric_choices),
        value='fold enrichment',
        description='x-axis:',
    )
    size_picker = widgets.Dropdown(
        options=size_choices,
        value='study_count',
        description='size:',
    )
    hue_picker = widgets.Dropdown(
        options=list(metric_choices),
        value='-log10(p-value_fdr_bh)',
        description='hue:',
    )
    suffix_picker = widgets.Dropdown(
        options=[
            ('', ''),
            ('cc', '(Cellular Component)'),
            ('bp', '(Biological Process)'),
            ('mf', '(Molecular Function)'),
        ],
        value='',
        description='y-label suffix:',
    )
    width_slider = widgets.IntSlider(min=1, max=50, step=1, value=11, description='width:', readout_format='0.0f')
    height_slider = widgets.IntSlider(min=1, max=50, step=1, value=14, description='height:', readout_format='0.0f')
    palette_picker = widgets.Dropdown(options=['flare', 'magma', 'viridis'], value='flare', description='color palette:')

    w = interactive(
        plot_goa_dotplot_with_lines,
        data=widgets.fixed(data),
        col1=x_axis_picker,
        goa_col=widgets.fixed('GO term'),
        col2=size_picker,
        col3=hue_picker,
        ylabel_suffix=suffix_picker,
        width=width_slider,
        height=height_slider,
        palette=palette_picker,
    )
    display(w, filename_text, save_button)

In [None]:
# Interactive dot plot for the "BP" (Biological Process) rows; a copy of the frame is passed.
interactive_plot(bp_pd.copy())

In [None]:
# Interactive dot plot for the "CC" (Cellular Component) rows; a copy of the frame is passed.
interactive_plot(cc_pd.copy())

In [None]:
# Interactive dot plot for the "MF" (Molecular Function) rows; a copy of the frame is passed.
interactive_plot(mf_pd.copy())