In [1]:
import altair as alt
import pandas as pd
import numpy as np

In [2]:
# Load the pathway-enrichment results from the "for_analysis" worksheet.
enrichment_xlsx = "../data/enrichment/pathway_enrichment.xlsx"
data = pd.read_excel(enrichment_xlsx, sheet_name="for_analysis")

In [3]:
# Preview the first five rows of the freshly loaded table.
data.head(n=5)

Unnamed: 0,Pathways,p-values,Abundance,Strain,Matches
0,Purine Riboucleotide De Novo Biosynthesis,0.000308,up,accAgltA,gmk // guaA // guaB // adk // purB // purE // ...
1,Carbohydrate Degradation,0.000393,up,accAgltA,PP_1776 // algC // pgm // pykA // eno // pgk /...
2,Carboxylate Degradation,0.00085,up,accAgltA,purT // acsA-I // PP_2217 // PP_2215 // bktB /...
3,Sugar Degradation,0.000983,up,accAgltA,PP_1776 // algC // pgm // pykA // eno // pgk /...
4,Amino Acid Biosynthesis,0.001607,up,accAgltA,argJ // argA // carA // argB // argC // astC /...


In [4]:
# Derive the columns used for plotting.

# Entries in "Matches" are separated by "//", so the number of matches is the
# separator count plus one.
data["num_matches"] = data["Matches"].str.count("//") + 1

# Negative log10 of the p-value: larger values correspond to smaller p-values.
data["-logP"] = -np.log10(data["p-values"])

# Recode the direction labels. The already-recoded "increased" label is
# accepted alongside the raw "up" so that re-running this cell is idempotent;
# without it a second run would relabel every row as "decreased".
data["Abundance"] = np.where(
    data["Abundance"].isin(["up", "increased"]), "increased", "decreased"
)

In [5]:
# Preview the table again to check the num_matches and -logP columns and the
# recoded Abundance labels.
data.head(n=5)

Unnamed: 0,Pathways,p-values,Abundance,Strain,Matches,num_matches,-logP
0,Purine Riboucleotide De Novo Biosynthesis,0.000308,increased,accAgltA,gmk // guaA // guaB // adk // purB // purE // ...,8,3.511449
1,Carbohydrate Degradation,0.000393,increased,accAgltA,PP_1776 // algC // pgm // pykA // eno // pgk /...,19,3.405607
2,Carboxylate Degradation,0.00085,increased,accAgltA,purT // acsA-I // PP_2217 // PP_2215 // bktB /...,25,3.070581
3,Sugar Degradation,0.000983,increased,accAgltA,PP_1776 // algC // pgm // pykA // eno // pgk /...,17,3.007446
4,Amino Acid Biosynthesis,0.001607,increased,accAgltA,argJ // argA // carA // argB // argC // astC /...,46,2.794097


# Generate enrichment plot

In [6]:
strain = "accAgltA"
# Bind the variable with @ rather than splicing it into the expression text,
# so the filter does not depend on quoting inside the strain name.
plot_df = data.query("Strain == @strain")

# Shared encodings: pathways on the y axis ordered by descending x value,
# one facet per abundance direction, crimson for "increased" and navy otherwise.
base_chart = (
    alt.Chart(plot_df, title=f"Enriched pathways in {strain}")
    .encode(
        y=alt.Y("Pathways", sort="-x", axis=alt.Axis(title=None)),
        x=alt.X("-logP", axis=alt.Axis(grid=False)),
        facet=alt.Facet(
            "Abundance",
            columns=2,
            header=alt.Header(labelFontSize=18, titleFontSize=18),
        ),
        color=alt.condition(
            "datum.Abundance == 'increased'",
            alt.value("crimson"),
            alt.value("navy"),
        ),
    )
)

# Bubble plot: marker size encodes the number of matches per pathway.
(
    base_chart.mark_circle()
    .encode(size="num_matches")
    # Each facet gets its own x and y scales.
    .resolve_scale(y="independent", x="independent")
    # Allow long pathway names on the y axis before truncation.
    .configure_axisY(labelLimit=400)
    .configure_axis(labelFontSize=15, titleFontSize=15)
    # Hide the border drawn around each facet view.
    .configure_view(strokeOpacity=0)
)

# Generate enrichment plot for top 10 hits

In [10]:
strain = "accAgltA"
top_n = 10  # number of most significant pathways kept per abundance direction

# Rows for the selected strain; @ binds the variable instead of splicing it
# into the expression text.
strain_df = data.query("Strain == @strain")

# Keep the top_n rows with the highest -logP for each direction, increased
# first and decreased second.
plot_df = pd.concat(
    [
        strain_df[strain_df["Abundance"] == direction]
        .sort_values(by=["-logP"], ascending=False)
        .head(top_n)
        for direction in ("increased", "decreased")
    ]
)

# Shared encodings: pathways on the y axis ordered by descending x value,
# one facet per abundance direction, crimson for "increased" and navy otherwise.
base_chart = (
    alt.Chart(plot_df, title=f"Enriched pathways in {strain}")
    .encode(
        y=alt.Y("Pathways", sort="-x", axis=alt.Axis(title=None)),
        x=alt.X("-logP", axis=alt.Axis(grid=False)),
        facet=alt.Facet(
            "Abundance",
            columns=2,
            header=alt.Header(labelFontSize=18, titleFontSize=18),
        ),
        color=alt.condition(
            "datum.Abundance == 'increased'",
            alt.value("crimson"),
            alt.value("navy"),
        ),
    )
)

# Bubble plot: marker size encodes the number of matches per pathway.
(
    base_chart.mark_circle()
    .encode(size="num_matches")
    # Each facet gets its own x and y scales.
    .resolve_scale(y="independent", x="independent")
    # Allow long pathway names on the y axis before truncation.
    .configure_axisY(labelLimit=400)
    .configure_axis(labelFontSize=15, titleFontSize=15)
    # Hide the border drawn around each facet view.
    .configure_view(strokeOpacity=0)
)