In [2]:
import pandas as pd

from Bio import SeqIO
from tqdm.notebook import tqdm
from dataclasses import dataclass
from collections import Counter

from datapaths import *

## Data

In [8]:
@dataclass
class Data:
    """Input files and derived result paths for one flipon class.

    Attributes:
        name: Short flipon label (e.g. "g4"); used as the "Flipon" column value.
        path_fa: FASTA file with one record per flipon region.
        path_bed: BED file with the flipon regions.
        path_bed_200: The ``*.slop200.bed`` variant of ``path_bed``.
        shape: Number of FASTA records in ``path_fa``. Always recomputed in
            ``__post_init__``; a value passed to the constructor is overwritten.
        path_ccre_intersection: Set later by the cCRE intersection cell.
        path_repeat_intersection: Set later by the repeat intersection cell.

    The path fields are annotated ``str``, but this notebook passes path objects
    built with ``/`` (later cells rely on ``path_bed_200.name``).
    """

    name: str
    path_fa: str
    path_bed: str
    path_bed_200: str
    shape: int = None
    path_ccre_intersection: str = None
    path_repeat_intersection: str = None

    def __post_init__(self):
        self.shape = self._get_shape()

    def _get_shape(self):
        """Return the number of records in ``path_fa``.

        Records are counted by their ``>`` header lines, so the result is right
        for both two-line-per-record and line-wrapped FASTA files.
        """
        with open(self.path_fa, "r") as f_in:
            return sum(1 for line in f_in if line.startswith(">"))


# One Data record per flipon class. All classes follow the same file naming
# scheme, so the paths are derived from the class label.
flipon_data = [
    Data(
        flipon_name,
        D_FLIPONS_FA / f"mm10.actb_ssdna_enriched_{flipon_name}.fa",
        D_FLIPONS_BED / f"mm10.actb_ssdna_enriched_{flipon_name}.bed",
        D_FLIPONS_BED / f"mm10.actb_ssdna_enriched_{flipon_name}.slop200.bed",
    )
    for flipon_name in ("g4", "sidd", "z-dna", "h-dna")
]


In [9]:
def reverse_complement(x: str):
    """Return the reverse complement of an upper-case DNA string.

    Only the letters A, C, G, T and N are known; any other character
    raises ``KeyError``.
    """
    complement = {"A": "T", "C": "G", "T": "A", "G": "C", "N": "N"}
    return "".join(complement[base] for base in reversed(x))


# Load the miRNA family table and turn the comma-separated RNA seeds into lists
# of DNA seeds (U -> T), plus a parallel column of their reverse complements.
mirna_table = pd.read_table(F_MIRNA_LIST)
seed_lists = mirna_table["Seed region"].str.replace("U", "T").str.split(", ")
seed_lists_rc = seed_lists.apply(
    lambda seeds: [reverse_complement(seed) for seed in seeds]
)
mirna_df = mirna_table.assign(
    **{"Seed region": seed_lists, "Seed region (RC)": seed_lists_rc}
)

print(mirna_df.shape)
mirna_df.iloc[10:20]


(177, 5)


Unnamed: 0,Conservation,miRNA gene family,# of genes,Seed region,Seed region (RC)
10,Bilateria,miR-34/449,6,[GGCAGTG],[CACTGCC]
11,Bilateria,miR-96/1271,2,[TTGGCAC],[GTGCCAA]
12,Bilateria,miR-99/100,3,[ACCCGTA],[TACGGGT]
13,Bilateria,miR-124,3,"[AAGGCAC, TAAGGCA]","[GTGCCTT, TGCCTTA]"
14,Bilateria,miR-125,3,[CCCTGAG],[CTCAGGG]
15,Bilateria,miR-133,3,"[TGGTCCC, TTGGTCC]","[GGGACCA, GGACCAA]"
16,Bilateria,miR-153,2,[TGCATAG],[CTATGCA]
17,Bilateria,miR-183,1,"[ATGGCAC, TGGCACT]","[GTGCCAT, AGTGCCA]"
18,Bilateria,miR-184,1,[GGACGGA],[TCCGTCC]
19,Bilateria,miR-190,2,[GATATGT],[ACATATC]


## Flipons to miRNA

In [10]:
def format_mirna_counts(mirna_counts: list):
    """Render per-region miRNA family counts as display strings.

    Args:
        mirna_counts: One mapping per region, from miRNA family name to an
            integer hit count.

    Returns:
        One string per region: "family (count)" items, sorted alphabetically
        and joined with ", ". A region with no hits gives an empty string.
    """
    return [
        ", ".join(
            sorted(f"{mirna_family} ({counts:,d})" for mirna_family, counts in region.items())
        )
        for region in mirna_counts
    ]

In [11]:
# Rows are (family, forward seeds, reverse-complement seeds). The table does not
# change inside the loops, so it is extracted once rather than once per record.
seed_table = mirna_df[
    ["miRNA gene family", "Seed region", "Seed region (RC)"]
].values

flipon_frames = []

for flipon in tqdm(flipon_data):
    print(f"{flipon.name}: {flipon.shape:,d} regions")

    fasta = list(SeqIO.parse(flipon.path_fa, "fasta"))

    # One Counter per FASTA record. They are appended rather than written into
    # lists presized from flipon.shape, so the result always has exactly one
    # entry per parsed record.
    mirna_list_fwd = []
    mirna_list_rcm = []
    for record in fasta:
        rec_fwd = record.seq.upper()
        counter_fwd = Counter()
        counter_rcm = Counter()
        for mirna_family, motiffs_f, motiffs_rc in seed_table:
            for motiff_fwd, motiff_rcm in zip(motiffs_f, motiffs_rc):
                # Both the seed and its reverse complement are searched in the
                # forward sequence of the region.
                counter_fwd[mirna_family] += rec_fwd.count(motiff_fwd)
                counter_rcm[mirna_family] += rec_fwd.count(motiff_rcm)
        # Unary plus drops families whose count is zero.
        mirna_list_fwd.append(+counter_fwd)
        mirna_list_rcm.append(+counter_rcm)

    # The intersection is taken from the raw Counters (family names present in
    # both), before the counts are turned into display strings.
    df = pd.DataFrame(
        {
            "Flipon": flipon.name,
            "Coordinates": [record.id for record in fasta],
            "miRNA (+)": format_mirna_counts(mirna_list_fwd),
            "miRNA (-)": format_mirna_counts(mirna_list_rcm),
            "miRNA (intersection)": [
                ", ".join(sorted(set(fwd) & set(rcm)))
                for fwd, rcm in zip(mirna_list_fwd, mirna_list_rcm)
            ],
        }
    )

    flipon_frames.append(df.sort_values("Coordinates"))

# Concatenate once instead of growing a DataFrame inside the loop.
flipon_to_mirna_df = pd.concat(flipon_frames, ignore_index=True)

# quoting=2 is csv.QUOTE_NONNUMERIC. The index is written as the first column.
flipon_to_mirna_df.to_csv(F_FLIPON_TO_MIRNA, sep="\t", quoting=2)
flipon_to_mirna_df


  0%|          | 0/4 [00:00<?, ?it/s]

g4: 20,253 regions
20253 20253 20253
sidd: 15,296 regions
15296 15296 15296
z-dna: 25,059 regions
25059 25059 25059
h-dna: 17,100 regions
17100 17100 17100


Unnamed: 0,Flipon,Coordinates,miRNA (+),miRNA (-),miRNA (intersection)
0,g4,chr10:100015775-100015790,,,
1,g4,chr10:100016214-100016246,,miR-149 (1),
2,g4,chr10:10012216-10012266,,,
3,g4,chr10:100147225-100147247,miR-365 (1),,
4,g4,chr10:100401253-100401285,,,
...,...,...,...,...,...
77703,h-dna,chrY:4200098-4200122,,,
77704,h-dna,chrY:4208594-4208610,,,
77705,h-dna,chrY:4223501-4223522,,,
77706,h-dna,chrY:4231086-4231113,,,


## Flipons to Gene Features

In [12]:
# Load the flipon -> gene annotation table (tab-separated) from F_FLIPON_TO_GENE.
flipon_to_gene_df = pd.read_csv(F_FLIPON_TO_GENE, sep="\t")
flipon_to_gene_df


Unnamed: 0,Flipon,Coordinates,Gene Name,Gene Strand,Gene Feature
0,g4,chr1:3014794-3014871,4933401J01Rik,+,Distal Intergenic
1,g4,chr1:3099888-3099963,Gm26206,+,Promoter (2-3kb)
2,g4,chr1:3287445-3287468,Gm18956,+,Intron
3,g4,chr1:3472953-3472969,Gm37686,-,Intron
4,g4,chr1:3535948-3535996,Gm7341,+,Intron
...,...,...,...,...,...
77703,z-dna,chrY:3866251-3866287,Gm8521,+,Distal Intergenic
77704,z-dna,chrY:4195990-4196008,Gm29038,-,Distal Intergenic
77705,z-dna,chrY:4202845-4202862,Gm28191,+,Distal Intergenic
77706,z-dna,chrY:4203100-4203146,Gm28191,+,Distal Intergenic


## Flipons to cCRE

In [13]:
# Intersect each flipon's slop200 BED (-a) with the cCRE BED (-b) using bedtools
# and record the output path on the Data object for the cells below.
# NOTE(review): paths are interpolated into the shell command unquoted, so a path
# containing spaces would break the command.
for flipon in flipon_data:
    # Output file: the slop200 BED file name with ".bed" replaced by
    # "_and_ccre.bed", placed under D_CCRE.
    flipon.path_ccre_intersection = D_CCRE / flipon.path_bed_200.name.replace('.bed', '_and_ccre.bed')
    # -wo writes each overlapping A/B pair followed by the overlap size in bp.
    !bedtools intersect -a {flipon.path_bed_200} -b {F_CCRE} -wo > {flipon.path_ccre_intersection}
    # Print the number of lines written to the output file.
    !wc -l {flipon.path_ccre_intersection}

14637 /home/fpavlov/projects/article_conserved_miRNA/data/cCRE/mm10.actb_ssdna_enriched_g4.slop200_and_ccre.bed
1683 /home/fpavlov/projects/article_conserved_miRNA/data/cCRE/mm10.actb_ssdna_enriched_sidd.slop200_and_ccre.bed
20638 /home/fpavlov/projects/article_conserved_miRNA/data/cCRE/mm10.actb_ssdna_enriched_z-dna.slop200_and_ccre.bed
3756 /home/fpavlov/projects/article_conserved_miRNA/data/cCRE/mm10.actb_ssdna_enriched_h-dna.slop200_and_ccre.bed


In [27]:
# Column names for the headerless `bedtools intersect -wo` output read below:
# six flipon columns, four cCRE columns, then the overlap size.
columns = [
    "flipon_chr",
    "flipon_start",
    "flipon_end",
    "Coordinates",
    "flipon_score",
    "flipon_strand",
    "ccre_chr",
    "ccre_start",
    "ccre_end",
    "ccre",
    "intersection_size",
]

ccre_frames = []
for flipon in flipon_data:
    ccre_hits = pd.read_table(
        flipon.path_ccre_intersection, header=None, names=columns
    )
    # One row per flipon region: all overlapping cCRE labels joined with ", ".
    # sort=False keeps regions in the order they first appear in the file.
    ccre_per_region = (
        ccre_hits.groupby("Coordinates", sort=False)["ccre"]
        .agg(", ".join)
        .rename("cCRE (+-200bp)")
        .reset_index()
        .assign(Flipon=flipon.name)
    )
    ccre_frames.append(ccre_per_region[["Flipon", "Coordinates", "cCRE (+-200bp)"]])

# Concatenate once instead of growing a DataFrame inside the loop.
flipon_to_ccre_df = pd.concat(ccre_frames, ignore_index=True)

flipon_to_ccre_df


Unnamed: 0,Flipon,Coordinates,cCRE (+-200bp)
0,g4,chr1:3671869-3671902,pELS (CTCF-bound)@EM10E0431220
1,g4,chr1:4493714-4493748,dELS@EM10E0431244
2,g4,chr1:4571896-4571924,dELS (CTCF-bound)@EM10E0431262
3,g4,chr1:5018367-5018390,"DNase-H3K4me3 (CTCF-bound)@EM10E0431331, PLS@E..."
4,g4,chr1:5019245-5019272,"pELS@EM10E0431333, PLS (CTCF-bound)@EM10E0431334"
...,...,...,...
24415,h-dna,chrX:152769666-152769687,pELS (CTCF-bound)@EM10E0930778
24416,h-dna,chrX:159987892-159987912,DNase-H3K4me3 (CTCF-bound)@EM10E0931180
24417,h-dna,chrX:161717977-161717993,"PLS@EM10E0931282, pELS@EM10E0931283"
24418,h-dna,chrX:162643118-162643148,"pELS@EM10E0931392, pELS@EM10E0931393, pELS (CT..."


## Flipons to Repeats

In [None]:
# Intersect each flipon's slop200 BED (-a) with the BED at F_RMSK_LINE_LTR (-b)
# using bedtools and record the output path on the Data object for the cells below.
# NOTE(review): paths are interpolated into the shell command unquoted, so a path
# containing spaces would break the command.
for flipon in flipon_data:
    # Output file: the slop200 BED file name with ".bed" replaced by
    # "_and_repeats.bed", placed under D_RMSK.
    flipon.path_repeat_intersection = D_RMSK / flipon.path_bed_200.name.replace('.bed', '_and_repeats.bed')
    # -wo writes each overlapping A/B pair followed by the overlap size in bp.
    !bedtools intersect -a {flipon.path_bed_200} -b {F_RMSK_LINE_LTR} -wo > {flipon.path_repeat_intersection}
    # Print the number of lines written to the output file.
    !wc -l {flipon.path_repeat_intersection}

13336 /home/fpavlov/projects/article_conserved_miRNA/data/rmsk/mm10.actb_ssdna_enriched_g4.slop200_and_repeats.bed
23477 /home/fpavlov/projects/article_conserved_miRNA/data/rmsk/mm10.actb_ssdna_enriched_sidd.slop200_and_repeats.bed
11835 /home/fpavlov/projects/article_conserved_miRNA/data/rmsk/mm10.actb_ssdna_enriched_z-dna.slop200_and_repeats.bed
15871 /home/fpavlov/projects/article_conserved_miRNA/data/rmsk/mm10.actb_ssdna_enriched_h-dna.slop200_and_repeats.bed


In [28]:
# Column names for the headerless `bedtools intersect -wo` output read below:
# six flipon columns, six repeat columns, then the overlap size.
columns = [
    "flipon_chr",
    "flipon_start",
    "flipon_end",
    "Coordinates",
    "flipon_score",
    "flipon_strand",
    "rep_chr",
    "rep_start",
    "rep_end",
    "rep_name",
    "rep_score",
    "rep_strand",
    "int_len",
]

repeat_frames = []
for flipon in flipon_data:
    repeat_hits = pd.read_table(
        flipon.path_repeat_intersection, header=None, names=columns
    )
    # One row per flipon region: all overlapping repeat names joined with ", ".
    # sort=False keeps regions in the order they first appear in the file.
    repeats_per_region = (
        repeat_hits.groupby("Coordinates", sort=False)["rep_name"]
        .agg(", ".join)
        .rename("LINE/LTR (+-200bp)")
        .reset_index()
        .assign(Flipon=flipon.name)
    )
    repeat_frames.append(
        repeats_per_region[["Flipon", "Coordinates", "LINE/LTR (+-200bp)"]]
    )

# Concatenate once instead of growing a DataFrame inside the loop.
flipon_to_repeat_df = pd.concat(repeat_frames, ignore_index=True)

flipon_to_repeat_df


Unnamed: 0,Flipon,Coordinates,LINE/LTR (+-200bp)
0,g4,chr1:3014794-3014871,"LINE@L1Md_F2(-), LINE@L1VL1(+)"
1,g4,chr1:3099888-3099963,"LINE@L1Md_F2(+), LTR@MTC(+)"
2,g4,chr1:3287445-3287468,LINE@L1MD(-)
3,g4,chr1:3472953-3472969,LINE@L1_Mus1(+)
4,g4,chr1:3535948-3535996,"LINE@L1_Mus2(-), LINE@L1VL1(+)"
...,...,...,...
39775,h-dna,chrY:4200098-4200122,LINE@L1MD(+)
39776,h-dna,chrY:4208594-4208610,"LTR@ORR1E(+), LINE@L1Md_T(-)"
39777,h-dna,chrY:4223501-4223522,LINE@L1_Mus2(-)
39778,h-dna,chrY:4231086-4231113,LTR@MuRRS4-int(-)


## Flipons to Everything

In [29]:
def get_feature_group(x: list):
    """Summarise which feature classes a flipon region overlaps.

    Args:
        x: Two-item sequence ``(ccre_labels, repeat_labels)`` of strings. A list,
            tuple, or a DataFrame row (Series) all work, because the values are
            taken by iteration order rather than by integer key.

    Returns:
        The applicable tags joined with " & ", in the order "cCRE", "LINE/LTR",
        "CTCF"; an empty string when none apply.
    """
    # Unpack by position. Indexing a label-indexed Series with x[0] / x[1] is
    # deprecated in pandas, so iterate instead.
    ccre_labels, repeat_labels, *_ = x

    components = []
    # Anything left after removing the CTCF-related labels and separators means
    # at least one non-CTCF cCRE label is present.
    non_ctcf_labels = (
        ccre_labels.replace(" (CTCF-bound)", "")
        .replace("CTCF-only", "")
        .replace("CTCF", "")
        .strip(", ")
    )
    if non_ctcf_labels != "":
        components.append("cCRE")
    if repeat_labels != "":
        components.append("LINE/LTR")
    if "CTCF" in ccre_labels:
        components.append("CTCF")

    return " & ".join(components)


In [40]:
# Left-join every annotation onto the gene table; regions without a match in a
# given table get an empty string in that table's columns.
flipon_to_data = (
    flipon_to_gene_df.merge(
        flipon_to_mirna_df, on=["Flipon", "Coordinates"], how="left"
    )
    .merge(flipon_to_ccre_df, on=["Flipon", "Coordinates"], how="left")
    .merge(flipon_to_repeat_df, on=["Flipon", "Coordinates"], how="left")
    .fillna("")
)

# Replace every "@<identifier>" suffix with "," so only the class labels remain.
# Only the two columns that feed the grouping are processed, not the whole frame.
feature_labels = flipon_to_data[["cCRE (+-200bp)", "LINE/LTR (+-200bp)"]].replace(
    regex=r"(@(.+?), )|(@(.+?)$)", value=","
)
# Pass plain two-item lists (as get_feature_group is annotated) instead of
# DataFrame rows, so the function never indexes a Series by integer position.
flipon_to_data["Feature Group"] = [
    get_feature_group([ccre_labels, repeat_labels])
    for ccre_labels, repeat_labels in zip(
        feature_labels["cCRE (+-200bp)"], feature_labels["LINE/LTR (+-200bp)"]
    )
]

flipon_to_data


Unnamed: 0,Flipon,Coordinates,Gene Name,Gene Strand,Gene Feature,miRNA (+),miRNA (-),miRNA (intersection),cCRE (+-200bp),LINE/LTR (+-200bp),Feature Group
0,g4,chr1:3014794-3014871,4933401J01Rik,+,Distal Intergenic,miR-328 (1),,,,"LINE@L1Md_F2(-), LINE@L1VL1(+)",LINE/LTR
1,g4,chr1:3099888-3099963,Gm26206,+,Promoter (2-3kb),,miR-328 (1),,,"LINE@L1Md_F2(+), LTR@MTC(+)",LINE/LTR
2,g4,chr1:3287445-3287468,Gm18956,+,Intron,,,,,LINE@L1MD(-),LINE/LTR
3,g4,chr1:3472953-3472969,Gm37686,-,Intron,,,,,LINE@L1_Mus1(+),LINE/LTR
4,g4,chr1:3535948-3535996,Gm7341,+,Intron,miR-129 (1),,,,"LINE@L1_Mus2(-), LINE@L1VL1(+)",LINE/LTR
...,...,...,...,...,...,...,...,...,...,...,...
77703,z-dna,chrY:3866251-3866287,Gm8521,+,Distal Intergenic,,,,,LTR@ORR1E(-),LINE/LTR
77704,z-dna,chrY:4195990-4196008,Gm29038,-,Distal Intergenic,,,,,"LINE@L1_Mus3(+), LTR@IAPLTR3(+)",LINE/LTR
77705,z-dna,chrY:4202845-4202862,Gm28191,+,Distal Intergenic,,,,,"LTR@IAPEY3-int(-), LTR@IAPEY_LTR(-)",LINE/LTR
77706,z-dna,chrY:4203100-4203146,Gm28191,+,Distal Intergenic,,,,,LINE@L1_Mur1(-),LINE/LTR


In [41]:
# Read the strand of every flipon region from its BED file.
# NOTE(review): skiprows=1 discards the first line of each BED file, presumably a
# header/track line. Confirm this; otherwise the first region of each file is
# lost here and ends up without a strand.
bed_columns = ['chr', 'start', 'end', 'Coordinates', 'score', 'Strand']
flipon_beds = pd.concat(
    [
        pd.read_table(
            flipon.path_bed, sep='\t', skiprows=1, header=None, names=bed_columns
        ).assign(Flipon=flipon.name)
        for flipon in flipon_data
    ],
    ignore_index=True,
)

output_columns = [
    'Flipon',
    'Coordinates',
    'Strand',
    'Gene Name',
    'Gene Strand',
    'Gene Feature',
    'miRNA (+)',
    'miRNA (-)',
    'miRNA (intersection)',
    'cCRE (+-200bp)',
    'LINE/LTR (+-200bp)',
    'Feature Group',
]

# Drop any "Strand" column left by a previous run of this cell. Without this, a
# re-run would produce Strand_x / Strand_y and the column selection would fail.
flipon_to_data = (
    flipon_to_data.drop(columns='Strand', errors='ignore')
    .merge(
        flipon_beds[['Flipon', 'Coordinates', 'Strand']],
        on=['Flipon', 'Coordinates'],
        how='left',
    )
)[output_columns]
flipon_to_data

Unnamed: 0,Flipon,Coordinates,Strand,Gene Name,Gene Strand,Gene Feature,miRNA (+),miRNA (-),miRNA (intersection),cCRE (+-200bp),LINE/LTR (+-200bp),Feature Group
0,g4,chr1:3014794-3014871,-,4933401J01Rik,+,Distal Intergenic,miR-328 (1),,,,"LINE@L1Md_F2(-), LINE@L1VL1(+)",LINE/LTR
1,g4,chr1:3099888-3099963,+,Gm26206,+,Promoter (2-3kb),,miR-328 (1),,,"LINE@L1Md_F2(+), LTR@MTC(+)",LINE/LTR
2,g4,chr1:3287445-3287468,-,Gm18956,+,Intron,,,,,LINE@L1MD(-),LINE/LTR
3,g4,chr1:3472953-3472969,-,Gm37686,-,Intron,,,,,LINE@L1_Mus1(+),LINE/LTR
4,g4,chr1:3535948-3535996,-,Gm7341,+,Intron,miR-129 (1),,,,"LINE@L1_Mus2(-), LINE@L1VL1(+)",LINE/LTR
...,...,...,...,...,...,...,...,...,...,...,...,...
77703,z-dna,chrY:3866251-3866287,.,Gm8521,+,Distal Intergenic,,,,,LTR@ORR1E(-),LINE/LTR
77704,z-dna,chrY:4195990-4196008,.,Gm29038,-,Distal Intergenic,,,,,"LINE@L1_Mus3(+), LTR@IAPLTR3(+)",LINE/LTR
77705,z-dna,chrY:4202845-4202862,.,Gm28191,+,Distal Intergenic,,,,,"LTR@IAPEY3-int(-), LTR@IAPEY_LTR(-)",LINE/LTR
77706,z-dna,chrY:4203100-4203146,.,Gm28191,+,Distal Intergenic,,,,,LINE@L1_Mur1(-),LINE/LTR


In [42]:
# mirna only: keep regions with a hit in at least one of the two miRNA columns.
# An explicit mask is used because `replace("", None)` followed by `dropna` has
# pandas-version-dependent behaviour (older releases forward-fill instead of
# inserting missing values).
has_mirna = flipon_to_data[["miRNA (+)", "miRNA (-)"]].fillna("").ne("").any(axis=1)
(
    flipon_to_data[has_mirna]
    .fillna("")
    .to_csv(F_FLIPON_TO_DATA_MIRNA, sep="\t", index=False)
)

# Full table, all regions.
flipon_to_data.to_csv(F_FLIPON_TO_DATA, sep='\t', index=False)