In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.graph_objects as pgo
import plotly.subplots as ps

# Keep DataFrame previews minimal: with max_rows=2 the echoed tables in this
# notebook show only the first and the last row, separated by an ellipsis row.
pd.set_option('display.max_rows', 2)

In [2]:
# Project root. Set the ARTICLE_CONSERVED_MIRNA_ROOT environment variable to
# run the notebook from another checkout or machine; when it is unset the
# original author's location is used, so existing runs behave as before.
d_root = Path(
    os.environ.get(
        "ARTICLE_CONSERVED_MIRNA_ROOT",
        "/home/fpavlov/projects/article_conserved_miRNA",
    )
)
d_data = d_root / "data"  # input tables
d_img = d_root / "img"  # rendered figures

# Input
f_flipon_to_data = d_data / "flipon_to_data.tsv"
f_flipon_to_data_og = d_data / "flipon_to_data_og.tsv"


In [3]:
def get_group(x: list) -> str:
    """Build the feature-group label for one flipon.

    Parameters
    ----------
    x
        Two strings in positional order: the cCRE overlap annotation, then
        the repeat overlap annotation. May be a plain sequence or the row
        that ``DataFrame.apply(axis=1)`` passes in.

    Returns
    -------
    str
        The labels that apply, joined with ``"&"`` in the fixed order
        ``cCRE``, ``Rep``, ``CTCF`` (for example ``"cCRE&CTCF"``); an empty
        string when none applies.
    """
    # Take the two values by position. Indexing a label-indexed pandas row
    # with x[0] relies on a positional fallback that recent pandas deprecates.
    values = list(x)
    ccre, reps = values[0], values[1]

    # Whatever is left once every CTCF marker (and separator residue) is
    # removed counts as a non-CTCF cCRE annotation.
    non_ctcf = (
        ccre.replace('(CTCF-bound)', '')
        .replace('CTCF-only', '')
        .replace('CTCF', '')
        .strip(", ")
    )

    labels = []
    if non_ctcf != '':
        labels.append("cCRE")
    if reps != "":
        labels.append("Rep")
    if "CTCF" in ccre:
        labels.append("CTCF")

    # Joining only the labels that apply avoids stray leading, trailing or
    # doubled "&" separators.
    return "&".join(labels)

def get_plotly_data(path: Path) -> pd.DataFrame:
    """Count flipons per (annotation, feature group, flipon group).

    Parameters
    ----------
    path
        Tab-separated table with at least the columns
        ``"cCRE (+-200bp slop)"``, ``"LINE/LTR repeats (+-200bp slop)"``,
        ``"annotation"`` and ``"group"``.

    Returns
    -------
    pd.DataFrame
        Columns ``annotation``, ``feature_group``, ``group``, ``count``.
        Empty labels are shown as ``"no annotation"`` and the raw group names
        ``quadruplex``/``z-dna``/``h-dna``/``sidd`` are mapped to
        ``G4``/``Z-DNA``/``H-DNA``/``SIDD``.

    Raises
    ------
    KeyError
        If one of the required columns is missing from the table.
    """
    ccre_col = "cCRE (+-200bp slop)"
    reps_col = "LINE/LTR repeats (+-200bp slop)"

    # NOTE(review): the notebook output recorded for the larger table echoes
    # this line, which looks like a pandas warning raised here (possibly a
    # mixed-dtype warning from the reader) -- TODO confirm and pin dtypes.
    flipons_to_data = pd.read_table(path).fillna('')

    # Only these four columns feed the result, so the regex clean-up is limited
    # to them instead of scanning every column of the table.
    overlaps = flipons_to_data[[ccre_col, reps_col, "annotation", "group"]].replace(
        # Replace everything from "@" up to and including the next ", "
        # (or up to the end of the value) with a bare comma.
        regex=r"(@(.+?), )|(@(.+?)$)",
        value=",",
    )

    feature_group = (
        overlaps[[ccre_col, reps_col]]
        .apply(get_group, axis=1)
        # Harmless if get_group already returns clean labels.
        .str.strip("&")
    )

    result = (
        overlaps.assign(feature_group=feature_group)
        .groupby(["annotation", "feature_group"])[["group"]]
        .value_counts()
        # Name the counts explicitly: the default name of this series differs
        # between pandas versions.
        .rename("count")
        .reset_index()
    )
    result = result.replace("", "no annotation").replace(
        {"quadruplex": "G4", "z-dna": "Z-DNA", "h-dna": "H-DNA", "sidd": "SIDD"}
    )
    return result


In [4]:
# Count table built from flipon_to_data.tsv; the plotting cell that reads
# `plotly_data` titles this subset "Flipons enriched with miRNA seed-regions".
plotly_data = get_plotly_data(f_flipon_to_data)
# Bare last expression: display a preview of the table.
plotly_data

Unnamed: 0,annotation,feature_group,group,count
0,no annotation,no annotation,Z-DNA,1223
...,...,...,...,...
205,Promoter (<=1kb),cCRE&Rep&CTCF,H-DNA,3


In [5]:
# Count table built from flipon_to_data_og.tsv; the plotting cell that reads
# `plotly_data_og` titles it simply "Flipons".
# NOTE(review): the output recorded for this cell echoes the read_table line of
# get_plotly_data, which looks like the tail of a pandas warning -- confirm
# which warning it is and whether the input needs explicit dtypes.
plotly_data_og = get_plotly_data(f_flipon_to_data_og)
# Bare last expression: display a preview of the table.
plotly_data_og

  flipons_to_data = pd.read_table(path).fillna('')


Unnamed: 0,annotation,feature_group,group,count
0,no annotation,no annotation,Z-DNA,3697
...,...,...,...,...
237,Promoter (<=1kb),cCRE&Rep&CTCF,SIDD,8


In [16]:
# Grid of horizontal bar charts for `plotly_data`: one row per feature group,
# one column per genomic annotation, one bar per flipon group.
# NOTE(review): this cell is a near-copy of the cell that plots
# `plotly_data_og`; consider factoring both into one helper function.

# Flipon groups in trace order. The list is reversed and the legend below uses
# traceorder="reversed", so the legend reads G4, Z-DNA, SIDD, H-DNA.
groups = ["G4", "Z-DNA", "SIDD", "H-DNA"][::-1]
annotations = [
    "Promoter (<=1kb)",
    "Promoter (1-2kb)",
    "Promoter (2-3kb)",
    "Distal Intergenic",
    "5' UTR",
    "3' UTR",
    "Downstream (<=300bp)",
    "Exon",
    "Intron",
    "no annotation",
]
feature_groups = [
    "cCRE",
    "Rep",
    "cCRE&Rep",
    "CTCF",
    "cCRE&CTCF",
    "Rep&CTCF",
    "cCRE&Rep&CTCF",
    "no annotation",
]

# Bar colour per flipon group (loop-invariant, so defined once).
colors = {
    "G4": "#FE9B54",
    "Z-DNA": "royalblue",
    "SIDD": "#ED4241",
    "H-DNA": "darkorchid",
}

# Counts at or above this value get their label inside the bar, smaller
# counts get it outside.
label_inside_from = 1300

n_rows = len(feature_groups)
n_cols = len(annotations)

fig = ps.make_subplots(
    rows=n_rows,
    cols=n_cols,
    shared_xaxes="all",
    shared_yaxes="all",
    horizontal_spacing=0.004,
    vertical_spacing=0.01,
    column_titles=annotations,
    row_titles=feature_groups,
    x_title="Flipon count",
    # y_title="Flipon group",
)

# Groups that already own a legend entry. The first trace of each group gets
# the entry, so the legend stays complete even when the first subplot lacks a
# group (or is empty and skipped).
groups_in_legend = set()

for row_no, feature_group in enumerate(feature_groups, start=1):
    for col_no, annotation in enumerate(annotations, start=1):
        df = (
            plotly_data[
                (plotly_data["feature_group"] == feature_group)
                & (plotly_data["annotation"] == annotation)
            ]
            .set_index("group")
            .reindex(groups)  # fixed group order; absent groups become NaN rows
            .dropna()  # ...and are dropped again
        )

        if df.empty:
            continue

        # One single-bar trace per group so that every group has its own
        # legend entry and colour:
        # https://stackoverflow.com/questions/70347594/how-to-format-plotly-legend-when-using-marker-color
        for group, count in df["count"].items():
            sub = pgo.Bar(
                x=[count],
                y=[group],
                name=group,
                textposition="outside" if count < label_inside_from else "inside",
                texttemplate="%{x:,d}",
                marker=dict(color=colors[group]),
                width=1,
                orientation="h",
                showlegend=group not in groups_in_legend,
            )
            groups_in_legend.add(group)
            fig.add_trace(sub, row=row_no, col=col_no)
        fig.update_xaxes(ticks="", row=row_no, col=col_no)

fig.update_xaxes(tickformat="s")
# Tick marks only along the bottom row of the grid.
fig.update_xaxes(ticks="outside", row=n_rows)
fig.update_yaxes(type="category", visible=False)
# Row/column titles are annotations: draw them horizontally.
fig.update_annotations(textangle=0, font=dict(size=14))

fig.update_layout(
    title=f"Flipons enriched with miRNA seed-regions ({plotly_data['count'].sum():,d})",
    title_x=0.0035,
    title_y=.985,
    height=900,
    width=1800,
    font=dict(family="monospace", size=10),
    uniformtext_minsize=10,
    margin=dict(l=5, t=55, b=55),
    legend=dict(
        # title_text="Flipon group",
        traceorder="reversed",
        orientation="h",
        yanchor="bottom",
        y=1.03,
        xanchor="right",
        x=0.98,
    ),
    template="ggplot2",
)

# write_image does not create missing directories.
d_img.mkdir(parents=True, exist_ok=True)
fig.write_image(d_img / 'enrichment-table-all-flipons-01-mirna-only.png')

fig.show()


In [7]:
# Grid of horizontal bar charts for `plotly_data_og`: one row per feature
# group, one column per genomic annotation, one bar per flipon group.
# NOTE(review): this cell is a near-copy of the cell that plots `plotly_data`;
# consider factoring both into one helper function. The two cells also differ
# in margins, title position and legend title -- confirm that is intended.

# Flipon groups in trace order. The list is reversed and the legend below uses
# traceorder="reversed", so the legend reads G4, Z-DNA, SIDD, H-DNA.
groups = ["G4", "Z-DNA", "SIDD", "H-DNA"][::-1]
annotations = [
    "Promoter (<=1kb)",
    "Promoter (1-2kb)",
    "Promoter (2-3kb)",
    "Distal Intergenic",
    "5' UTR",
    "3' UTR",
    "Downstream (<=300bp)",
    "Exon",
    "Intron",
    "no annotation",
]
feature_groups = [
    "cCRE",
    "Rep",
    "cCRE&Rep",
    "CTCF",
    "cCRE&CTCF",
    "Rep&CTCF",
    "cCRE&Rep&CTCF",
    "no annotation",
]

# Bar colour per flipon group (loop-invariant, so defined once).
colors = {
    "G4": "#FE9B54",
    "Z-DNA": "royalblue",
    "SIDD": "#ED4241",
    "H-DNA": "darkorchid",
}

# Counts at or above this value get their label inside the bar, smaller
# counts get it outside.
label_inside_from = 1300

n_rows = len(feature_groups)
n_cols = len(annotations)

fig = ps.make_subplots(
    rows=n_rows,
    cols=n_cols,
    shared_xaxes="all",
    shared_yaxes="all",
    horizontal_spacing=0.004,
    vertical_spacing=0.01,
    column_titles=annotations,
    row_titles=feature_groups,
    x_title="Flipon count",
    # y_title="Flipon group",
)

# Groups that already own a legend entry. The first trace of each group gets
# the entry, so the legend stays complete even when the first subplot lacks a
# group (or is empty and skipped).
groups_in_legend = set()

for row_no, feature_group in enumerate(feature_groups, start=1):
    for col_no, annotation in enumerate(annotations, start=1):
        df = (
            plotly_data_og[
                (plotly_data_og["feature_group"] == feature_group)
                & (plotly_data_og["annotation"] == annotation)
            ]
            .set_index("group")
            .reindex(groups)  # fixed group order; absent groups become NaN rows
            .dropna()  # ...and are dropped again
        )

        if df.empty:
            continue

        # One single-bar trace per group so that every group has its own
        # legend entry and colour:
        # https://stackoverflow.com/questions/70347594/how-to-format-plotly-legend-when-using-marker-color
        for group, count in df["count"].items():
            sub = pgo.Bar(
                x=[count],
                y=[group],
                name=group,
                textposition="outside" if count < label_inside_from else "inside",
                texttemplate="%{x:,d}",
                marker=dict(color=colors[group]),
                width=1,
                orientation="h",
                showlegend=group not in groups_in_legend,
            )
            groups_in_legend.add(group)
            fig.add_trace(sub, row=row_no, col=col_no)
        fig.update_xaxes(ticks="", row=row_no, col=col_no)

fig.update_xaxes(tickformat="s")
# Tick marks only along the bottom row of the grid.
fig.update_xaxes(ticks="outside", row=n_rows)
fig.update_yaxes(type="category", visible=False)

# Row/column titles are annotations: draw them horizontally.
fig.update_annotations(textangle=0, font=dict(size=14))
fig.update_layout(
    title=f"Flipons ({plotly_data_og['count'].sum():,d})",
    title_x=0.005,
    height=900,
    width=1800,
    font=dict(family="monospace", size=10),
    uniformtext_minsize=10,
    margin=dict(l=5, t=5, b=5),
    legend=dict(
        title_text="Flipon group",
        traceorder="reversed",
        orientation="h",
        yanchor="bottom",
        y=1.03,
        xanchor="right",
        x=0.98,
    ),
    template="ggplot2",
)

# write_image does not create missing directories.
d_img.mkdir(parents=True, exist_ok=True)
fig.write_image(d_img / 'enrichment-table-all-flipons-01-all.png')

fig.show()
