In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as pgo
import plotly.subplots as ps
import plotly.express as px

from pathlib import Path

pd.set_option('display.max_rows', 4)

d_root = Path("/home/fpavlov/projects/article_conserved_miRNA")
d_data = d_root / "data"
d_img = d_root / "img"

# Input
f_flipon_to_data = d_data / "flipon_to_data.tsv"
f_flipon_to_data_og = d_data / "flipon_to_data_og.tsv"


In [2]:
flipons_to_data = pd.read_table(f_flipon_to_data).fillna('')
flipons_to_data

Unnamed: 0,group,coordinates,miRNA (+ strand),miRNA (- strand),miRNA (intersection),annotation,gene_info,cCRE (+-200bp slop),LINE/LTR repeats (+-200bp slop)
0,sidd,chr10:100119606-100119743,"miR-539, miR-670","miR-143, miR-186, miR-30",,Distal Intergenic,Gm22918 (+),,"LTR@RLTR10C (+), LINE@Lx8b (-), LTR@MTB (+)"
1,sidd,chr10:100146768-100146986,"miR-155, miR-374",miR-448,,Promoter (2-3kb),Gm25287 (+),,LTR@MTB_Mm (-)
...,...,...,...,...,...,...,...,...,...
31684,h-dna,chrY:3865190-3865212,miR-122,,,Distal Intergenic,Gm8521 (+),,
31685,h-dna,chrY:3879642-3879666,miR-185,,,Distal Intergenic,Gm18177 (-),,


In [3]:
form_df_1 = (
    flipons_to_data.query(
        'annotation.str.contains("Promoter") and \
        `cCRE (+-200bp slop)` == "" and \
        `LINE/LTR repeats (+-200bp slop)` != ""'
    )
    .replace(regex=" \(\d+\)", value="")
    .assign(type="Repeats w/o cCREs")
)

form_df_2 = (
    flipons_to_data.query(
        'annotation.str.contains("Promoter") and \
        `cCRE (+-200bp slop)` != "" and \
        `LINE/LTR repeats (+-200bp slop)` == ""'
    )
    .replace(regex=" \(\d+\)", value="")
    .assign(type="cCREs w/o Repeats")
)

form_df = (
    pd.concat([form_df_1, form_df_2], ignore_index=True)
    .assign(
        p=lambda x: x["miRNA (+ strand)"].apply(lambda x: x.split(", ")),
        m=lambda x: x["miRNA (- strand)"].apply(lambda x: x.split(", ")),
        gene_strand=lambda x: x['gene_info'].str[-3:].str[1].apply(lambda y: 'p' if y=="+" else 'm')
    )
    .explode("p")
    .explode("m")
    .loc[:, ["type", "annotation", 'gene_strand', "group", "p", "m"]]
    .melt(
        id_vars=["type", "annotation", "gene_strand", "group"],
        value_vars=["p", "m"],
        var_name="mirna_strand",
        value_name="mirna",
    )
    .assign(
        mirna_gene_strand_orientation = lambda x: x[['gene_strand', 'mirna_strand']].apply(lambda y: 'same' if y[0]==y[1] else 'different', axis=1)
    )
    .groupby(["mirna", "type", "group"])[["mirna_gene_strand_orientation"]]
    .value_counts()
    .to_frame()
    .rename(columns={0: "total_count"})
    .reset_index()
    .replace("", np.nan)
    .dropna()
    .replace('quadruplex', 'g4')
    .sort_values(["mirna", 'type', 'group'], ascending=[False, False, False])
    .reset_index(drop=True)
)

form_df['total_count_signed'] = form_df.apply(lambda x: x.total_count if x.mirna_gene_strand_orientation=="same" else -x.total_count, axis=1)

form_df['total_group_count'] = form_df.groupby('group')['total_count'].transform('sum')
form_df['total_group_count_rel'] = form_df['total_count'] / form_df['total_group_count'] * 100
form_df['total_group_signed_rel'] = form_df.apply(lambda x: x.total_group_count_rel if x.mirna_gene_strand_orientation=="same" else -x.total_group_count_rel, axis=1)

form_df['total_mirna_count'] = form_df.groupby('mirna')['total_count'].transform('sum')
form_df['total_mirna_count_rel'] = form_df['total_count'] / form_df['total_mirna_count'] * 100
form_df['total_mirna_signed_rel'] = form_df.apply(lambda x: x.total_mirna_count_rel if x.mirna_gene_strand_orientation=="same" else -x.total_mirna_count_rel, axis=1)

form_df['total_mirna_group_count'] = form_df.groupby(['mirna', 'group'])['total_count'].transform('sum')
form_df['total_mirna_group_count_rel'] = form_df['total_count'] / form_df['total_mirna_group_count'] * 100
form_df['total_mirna_group_signed_rel'] = form_df.apply(lambda x: x.total_mirna_group_count_rel if x.mirna_gene_strand_orientation=="same" else -x.total_mirna_group_count_rel, axis=1)

form_df['custom_type'] = form_df['type'] + ", " + form_df['mirna_gene_strand_orientation']

form_df


Unnamed: 0,mirna,type,group,mirna_gene_strand_orientation,total_count,total_count_signed,total_group_count,total_group_count_rel,total_group_signed_rel,total_mirna_count,total_mirna_count_rel,total_mirna_signed_rel,total_mirna_group_count,total_mirna_group_count_rel,total_mirna_group_signed_rel,custom_type
0,miR-99/100,cCREs w/o Repeats,z-dna,same,3,3,2129,0.140911,0.140911,8,37.500000,37.500000,4,75.000000,75.000000,"cCREs w/o Repeats, same"
1,miR-99/100,cCREs w/o Repeats,z-dna,different,1,-1,2129,0.046970,-0.046970,8,12.500000,-12.500000,4,25.000000,-25.000000,"cCREs w/o Repeats, different"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1198,let-7/miR-98,Repeats w/o cCREs,sidd,same,6,6,11625,0.051613,0.051613,33,18.181818,18.181818,22,27.272727,27.272727,"Repeats w/o cCREs, same"
1199,let-7/miR-98,Repeats w/o cCREs,h-dna,different,2,-2,422,0.473934,-0.473934,33,6.060606,-6.060606,3,66.666667,-66.666667,"Repeats w/o cCREs, different"


In [4]:
def set_color(inp: str):
    # colors = {"G4": "ff6600", "Z-DNA": "6600ff", "SIDD": "ff0009", "H-DNA": "ef00ff"}
    colors = {"cCREs w/o Repeats": "ff6600", "Repeats w/o cCREs": "6600ff"}
    return colors[inp]

groups = ('g4', 'z-dna', 'sidd', 'h-dna')

big_df = pd.concat([form_df_1, form_df_2])
big_df['group'] = big_df['group'].replace('quadruplex', 'g4')

for group in groups:
    df = big_df[big_df['group']==group]

    out = pd.DataFrame()
    out["Type"] = df["type"]
    out["Shape"] = "circle"
    out = pd.concat(
        [
            out,
            df["coordinates"]
            .str.split(r":|-", regex=True, expand=True)
            .set_axis(["Chr", "Start", "End"], axis=1, copy=False),
        ],
        axis=1,
    )
    out["Chr"] = out["Chr"].str.replace("chr", "")
    out["color"] = out["Type"].apply(set_color)

    out.to_csv(d_data / f"repeats_wo_ccres_{group}.tsv", sep="\t", index=False)

In [32]:
groups = ["g4", "z-dna", "sidd", "h-dna"]
categories = [
    # "Repeats w/o cCREs, same",
    # "Repeats w/o cCREs, different",
    "cCREs w/o Repeats, same",
    "cCREs w/o Repeats, different",
]
colors = {
    # "Repeats w/o cCREs, same": "tomato",
    # "Repeats w/o cCREs, different": "royalblue",
    "cCREs w/o Repeats, same": "tomato",
    "cCREs w/o Repeats, different": "royalblue",
    # "cCREs w/o Repeats, same": "darksalmon",
    # "cCREs w/o Repeats, different": "cornflowerblue",
}

n_rows = 1
n_cols = 4

fig = ps.make_subplots(
    rows=n_rows,
    cols=n_cols,
    shared_xaxes="all",
    shared_yaxes="all",
    horizontal_spacing=0.004,
    vertical_spacing=0.01,
    column_titles=[
        f"<b>{x.upper()}</b> ({form_df[(form_df['group']==x) & form_df['custom_type'].isin(categories)]['total_count'].sum():,d}/{form_df[form_df['group']==x]['total_count'].sum():,d})"
        for x in groups
    ],
    # y_title="Number of miRNAs per flipon",
    # x_title="Flipon count",
)

for i, group in enumerate(groups):
    for category in categories:
        df = form_df[(form_df["group"] == group) & (form_df["custom_type"] == category)]
        sub = pgo.Bar(
            x=df["total_count_signed"],
            y=df["mirna"],
            # text=df['total_count'],
            name=category,
            texttemplate="%{x:,d}",
            textposition=df["total_count_signed"].apply(
                lambda x: "outside" if abs(x) <= 150 else "inside"
            ),
            cliponaxis=False,
            marker=dict(color=colors[category]),
            orientation="h",
            width=0.9,
            showlegend=True if (i == 0) else False,
        )
        fig.add_trace(sub, row=1, col=i + 1)

fig.update_xaxes(tickformat="s", showgrid=False)
fig.update_yaxes(
    ticks="",
    type="category",
    categoryarray=form_df.sort_values("mirna", ascending=False)["mirna"].unique(),
    categoryorder="array",
)
fig.update_annotations(font=dict(size=14))

fig.update_layout(
    # title=f"Flipons enriched with miRNA seed-regions ({plotly_data['count'].sum():,d})",
    # title_x=0.0035,
    # title_y=.985,
    height=3000,
    width=1000,
    barmode="relative",
    font=dict(family="monospace", size=12),
    uniformtext_minsize=12,
    uniformtext_mode="show",
    margin=dict(t=0, r=5, b=5, l=160),
    legend=dict(
        # title_text="Flipon group",
        # traceorder="reversed",
        orientation="h",
        yanchor="bottom",
        y=1.01,
        xanchor="right",
        x=1,
    ),
    template="ggplot2",
)

fig.write_image(d_img / "flipon_to_mirna_mapping-ccre-vs-repeats-strands-01-ccre.png", scale=2)
fig.show()


In [33]:
groups = ["g4", "z-dna", "sidd", "h-dna"]
categories = [
    "Repeats w/o cCREs, same",
    "Repeats w/o cCREs, different",
    # "cCREs w/o Repeats, same",
    # "cCREs w/o Repeats, different",
]
colors = {
    "Repeats w/o cCREs, same": "tomato",
    "Repeats w/o cCREs, different": "royalblue",
    # "cCREs w/o Repeats, same": "tomato",
    # "cCREs w/o Repeats, different": "royalblue",
    # "cCREs w/o Repeats, same": "darksalmon",
    # "cCREs w/o Repeats, different": "cornflowerblue",
}

n_rows = 1
n_cols = 4

fig = ps.make_subplots(
    rows=n_rows,
    cols=n_cols,
    shared_xaxes="all",
    shared_yaxes="all",
    horizontal_spacing=0.004,
    vertical_spacing=0.01,
    column_titles=[
        f"<b>{x.upper()}</b> ({form_df[(form_df['group']==x) & form_df['custom_type'].isin(categories)]['total_count'].sum():,d}/{form_df[form_df['group']==x]['total_count'].sum():,d})"
        for x in groups
    ],
    # y_title="Number of miRNAs per flipon",
    # x_title="Flipon count",
)

for i, group in enumerate(groups):
    for category in categories:
        df = form_df[(form_df["group"] == group) & (form_df["custom_type"] == category)]
        sub = pgo.Bar(
            x=df["total_count_signed"],
            y=df["mirna"],
            # text=df['total_count'],
            name=category,
            texttemplate="%{x:,d}",
            textposition=df["total_count_signed"].apply(
                lambda x: "outside" if abs(x) <= 150 else "inside"
            ),
            cliponaxis=False,
            marker=dict(color=colors[category]),
            orientation="h",
            width=0.9,
            showlegend=True if (i == 0) else False,
        )
        fig.add_trace(sub, row=1, col=i + 1)

fig.update_xaxes(tickformat="s", showgrid=False)
fig.update_yaxes(
    ticks="",
    type="category",
    categoryarray=form_df.sort_values("mirna", ascending=False)["mirna"].unique(),
    categoryorder="array",
)
fig.update_annotations(font=dict(size=14))

fig.update_layout(
    # title=f"Flipons enriched with miRNA seed-regions ({plotly_data['count'].sum():,d})",
    # title_x=0.0035,
    # title_y=.985,
    height=3000,
    width=1000,
    barmode="relative",
    font=dict(family="monospace", size=12),
    uniformtext_minsize=12,
    uniformtext_mode="show",
    margin=dict(t=0, r=5, b=5, l=160),
    legend=dict(
        # title_text="Flipon group",
        # traceorder="reversed",
        orientation="h",
        yanchor="bottom",
        y=1.01,
        xanchor="right",
        x=1,
    ),
    template="ggplot2",
)

fig.write_image(d_img / "flipon_to_mirna_mapping-ccre-vs-repeats-strands-01-repeats.png", scale=2)
fig.show()
