In [1]:
import tempfile
import pandas as pd

from Bio import SeqIO
from tqdm.notebook import tqdm
from dataclasses import dataclass
from collections import Counter

from datapaths import *

## Data

In [2]:
@dataclass
class Data:
    """File locations for one flipon class, plus its FASTA record count.

    Attributes:
        name: Short flipon label (e.g. "g4").
        path_fa: FASTA file with the flipon sequences; used to compute ``shape``.
        path_bed: BED file for the flipon class.
        path_bed_200: The ``slop200`` variant of the BED file (presumably the
            intervals padded by 200 bp - confirm against how it was produced).
        shape: Number of FASTA records in ``path_fa``. Always recomputed in
            ``__post_init__``, so a value passed to the constructor is ignored.
        path_ccre_intersection: Filled in later with the cCRE intersection file.
        path_repeat_intersection: Filled in later with the repeat intersection file.

    The path fields are annotated ``str`` but the notebook passes path objects
    (``.name`` is read from ``path_bed_200`` further down).
    """

    name: str
    path_fa: str
    path_bed: str
    path_bed_200: str
    shape: int = None
    path_ccre_intersection: str = None
    path_repeat_intersection: str = None

    def __post_init__(self):
        # Derive the record count from the FASTA file itself.
        self.shape = self._get_shape()

    def _get_shape(self):
        """Return the number of records (``>`` header lines) in ``path_fa``.

        Counting headers instead of ``lines // 2`` gives the same answer for
        two-line-per-record FASTA and stays correct when sequences are wrapped
        over several lines or the file contains blank lines.
        """
        with open(self.path_fa, "r") as f_in:
            return sum(1 for line in f_in if line.startswith(">"))


# One Data record per flipon class. All input files share the
# "mm10.actb_ssdna_enriched_<name>" stem, so the paths are derived from the name.
flipon_data = [
    Data(
        flipon_name,
        D_FLIPONS_FA / f"mm10.actb_ssdna_enriched_{flipon_name}.fa",
        D_FLIPONS_BED / f"mm10.actb_ssdna_enriched_{flipon_name}.bed",
        D_FLIPONS_BED / f"mm10.actb_ssdna_enriched_{flipon_name}.slop200.bed",
    )
    for flipon_name in ("g4", "sidd", "z-dna", "h-dna")
]


## Flipons to miR

In [3]:
# Flipon -> miRNA table (tab-separated).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the warning saved in this cell's
# output points at this call and is presumably the mixed-dtype DtypeWarning
# that chunked inference emits for mostly-empty string columns - confirm.
flipon_to_mirna_df = pd.read_table(F_FLIPON_TO_MIRNA, low_memory=False)
flipon_to_mirna_df


  flipon_to_mirna_df = pd.read_table(F_FLIPON_TO_MIRNA)


Unnamed: 0,Flipon,Coordinates,c. M miR (+),c. M miR (-),c. MJ miR (+),c. MJ miR (-),c. J miR (+),c. J miR (-),M miR (+),M miR (-),MJ miR (+),MJ miR (-),J miR (+),J miR (-)
0,g4,chr10:100016214-100016246,,miR-149 (1),,miR-331 (1),,,"miR-6943 (1), miR-6975/miR-7005 (1), miR-7034/...","miR-149 (1), miR-1894 (1)","miR-6911/miR-7028/miR-7079/miR-7662 (1), miR-6...",miR-6904/miR-6914 (1),miR-883a (1),
1,g4,chr10:100147225-100147247,miR-365 (1),,,,,,miR-6920 (1),"miR-6418 (1), miR-7001 (1), miR-7030/miR-7075/...",,"miR-698/miR-7078 (1), miR-3087 (1), miR-7652 (1)",miR-770 (1),miR-3102-5p.2 (1)
2,g4,chr10:100661290-100661309,miR-185 (1),,,,,,"miR-185 (1), miR-1249 (1), miR-1943/miR-6967/m...",,"miR-1249 (1), miR-1943/miR-6967/miR-7016 (2), ...",,,
3,g4,chr10:100661317-100661348,miR-185 (1),,miR-185 (1),,,,"miR-185 (1), miR-1249 (2), miR-1943/miR-6967/m...",miR-12183 (1),"miR-185 (1), miR-1249 (1), miR-1943/miR-6967/m...",,,
4,g4,chr10:100926057-100926093,,miR-328 (1),,miR-486 (1),,,"miR-698/miR-7078 (1), miR-3572 (1), miR-6956 (...","miR-23a (1), miR-6904/miR-6914 (1)","miR-3076 (1), miR-6418 (1)",miR-486a/miR-486b (1),,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74542,z-dna,chrX:74841465-74841490,,,,,,,,,,,,miR-669k (1)
74543,z-dna,chrX:86192448-86192467,,,,,,,,,,,,miR-7064 (1)
74544,z-dna,chrX:9200174-9200187,,,,,,,,,,,miR-6994 (1),
74545,z-dna,chrX:9387131-9387148,,,,,,,,,,,,miR-5627 (1)


## Flipons to Gene Features

In [4]:
# Flipon -> gene-feature annotation table, read as tab-separated text.
flipon_to_gene_df = pd.read_csv(F_FLIPON_TO_GENE, sep="\t")
flipon_to_gene_df


Unnamed: 0,Flipon,Coordinates,Gene Feature,Gene Name,Gene Strand,Gene Type,Gene ID,Transcript ID
0,g4,chr1:3014794-3014871,Distal Intergenic,4933401J01Rik,+,TEC,ENSMUSG00000102693.1,ENSMUST00000193812.1
1,g4,chr1:3099888-3099963,Promoter (2-3kb),Gm26206,+,snRNA,ENSMUSG00000064842.1,ENSMUST00000082908.1
2,g4,chr1:3287445-3287468,Intron,Gm18956,+,processed_pseudogene,ENSMUSG00000102851.1,ENSMUST00000192857.1
3,g4,chr1:3472953-3472969,Intron,Gm37686,-,TEC,ENSMUSG00000103025.1,ENSMUST00000194099.1
4,g4,chr1:3535948-3535996,Intron,Gm7341,+,processed_pseudogene,ENSMUSG00000103147.1,ENSMUST00000192183.1
...,...,...,...,...,...,...,...,...
77703,z-dna,chrY:3866251-3866287,Distal Intergenic,Gm8521,+,unprocessed_pseudogene,ENSMUSG00000099838.1,ENSMUST00000190394.1
77704,z-dna,chrY:4195990-4196008,Distal Intergenic,Gm29038,-,unprocessed_pseudogene,ENSMUSG00000101108.1,ENSMUST00000191543.1
77705,z-dna,chrY:4202845-4202862,Distal Intergenic,Gm28191,+,unprocessed_pseudogene,ENSMUSG00000100300.1,ENSMUST00000189112.1
77706,z-dna,chrY:4203100-4203146,Distal Intergenic,Gm28191,+,unprocessed_pseudogene,ENSMUSG00000100300.1,ENSMUST00000189112.1


## Flipons to cCRE

In [5]:
# Intersect each flipon class's slop200 BED with the cCRE annotation and
# remember where the result was written.
for flipon in flipon_data:
    # Output lives in D_CCRE and is named after the slop200 BED, with '.bed'
    # swapped for '_and_ccre.bed' (str.replace touches every '.bed' occurrence).
    flipon.path_ccre_intersection = D_CCRE / flipon.path_bed_200.name.replace('.bed', '_and_ccre.bed')
    # bedtools -wo reports each overlapping A/B pair followed by the number of
    # overlapping bases; the shell redirect overwrites any previous result file.
    !bedtools intersect -a {flipon.path_bed_200} -b {F_CCRE} -wo > {flipon.path_ccre_intersection}
    # Print the line count of the result as a quick sanity check.
    !wc -l {flipon.path_ccre_intersection}

14637 /home/fpavlov/projects/article_conserved_miRNA/data/cCRE/mm10.actb_ssdna_enriched_g4.slop200_and_ccre.bed
1683 /home/fpavlov/projects/article_conserved_miRNA/data/cCRE/mm10.actb_ssdna_enriched_sidd.slop200_and_ccre.bed
20638 /home/fpavlov/projects/article_conserved_miRNA/data/cCRE/mm10.actb_ssdna_enriched_z-dna.slop200_and_ccre.bed
3756 /home/fpavlov/projects/article_conserved_miRNA/data/cCRE/mm10.actb_ssdna_enriched_h-dna.slop200_and_ccre.bed


In [6]:
# Column names for the header-less intersection files read below:
# six flipon BED fields, four cCRE fields, then the overlap length.
ccre_columns = [
    "flipon_chr",
    "flipon_start",
    "flipon_end",
    "Coordinates",
    "flipon_score",
    "flipon_strand",
    "ccre_chr",
    "ccre_start",
    "ccre_end",
    "ccre",
    "intersection_size",
]

# Build one frame per flipon class and concatenate once at the end, rather
# than re-copying a growing DataFrame on every loop iteration.
ccre_frames = []
for flipon in flipon_data:
    ccre_hits = pd.read_table(
        flipon.path_ccre_intersection, header=None, names=ccre_columns
    )
    ccre_frames.append(
        # One row per flipon: all overlapping cCRE labels joined with ", ".
        # sort=False keeps flipons in order of first appearance in the file.
        ccre_hits.groupby("Coordinates", sort=False)["ccre"]
        .agg(", ".join)
        .reset_index(name="cCRE (+-200bp)")
        .assign(Flipon=flipon.name)[["Flipon", "Coordinates", "cCRE (+-200bp)"]]
    )

flipon_to_ccre_df = pd.concat(ccre_frames, ignore_index=True)

flipon_to_ccre_df


Unnamed: 0,Flipon,Coordinates,cCRE (+-200bp)
0,g4,chr1:3671869-3671902,pELS (CTCF-bound)@EM10E0431220
1,g4,chr1:4493714-4493748,dELS@EM10E0431244
2,g4,chr1:4571896-4571924,dELS (CTCF-bound)@EM10E0431262
3,g4,chr1:5018367-5018390,"DNase-H3K4me3 (CTCF-bound)@EM10E0431331, PLS@E..."
4,g4,chr1:5019245-5019272,"pELS@EM10E0431333, PLS (CTCF-bound)@EM10E0431334"
...,...,...,...
24415,h-dna,chrX:152769666-152769687,pELS (CTCF-bound)@EM10E0930778
24416,h-dna,chrX:159987892-159987912,DNase-H3K4me3 (CTCF-bound)@EM10E0931180
24417,h-dna,chrX:161717977-161717993,"PLS@EM10E0931282, pELS@EM10E0931283"
24418,h-dna,chrX:162643118-162643148,"pELS@EM10E0931392, pELS@EM10E0931393, pELS (CT..."


## Flipons to repeats

In [7]:
# Intersect each flipon class's slop200 BED with the LINE/LTR repeat
# annotation and remember where the result was written.
for flipon in flipon_data:
    # Output lives in D_RMSK and is named after the slop200 BED, with '.bed'
    # swapped for '_and_repeats.bed' (str.replace touches every '.bed' occurrence).
    flipon.path_repeat_intersection = D_RMSK / flipon.path_bed_200.name.replace('.bed', '_and_repeats.bed')
    # bedtools -wo reports each overlapping A/B pair followed by the number of
    # overlapping bases; the shell redirect overwrites any previous result file.
    !bedtools intersect -a {flipon.path_bed_200} -b {F_RMSK_LINE_LTR} -wo > {flipon.path_repeat_intersection}
    # Print the line count of the result as a quick sanity check.
    !wc -l {flipon.path_repeat_intersection}

13336 /home/fpavlov/projects/article_conserved_miRNA/data/rmsk/mm10.actb_ssdna_enriched_g4.slop200_and_repeats.bed
23477 /home/fpavlov/projects/article_conserved_miRNA/data/rmsk/mm10.actb_ssdna_enriched_sidd.slop200_and_repeats.bed
11835 /home/fpavlov/projects/article_conserved_miRNA/data/rmsk/mm10.actb_ssdna_enriched_z-dna.slop200_and_repeats.bed
15871 /home/fpavlov/projects/article_conserved_miRNA/data/rmsk/mm10.actb_ssdna_enriched_h-dna.slop200_and_repeats.bed


In [8]:
# Column names for the header-less intersection files read below:
# six flipon BED fields, six repeat BED fields, then the overlap length.
repeat_columns = [
    "flipon_chr",
    "flipon_start",
    "flipon_end",
    "Coordinates",
    "flipon_score",
    "flipon_strand",
    "rep_chr",
    "rep_start",
    "rep_end",
    "rep_name",
    "rep_score",
    "rep_strand",
    "int_len",
]

# Build one frame per flipon class and concatenate once at the end, rather
# than re-copying a growing DataFrame on every loop iteration.
repeat_frames = []
for flipon in flipon_data:
    repeat_hits = pd.read_table(
        flipon.path_repeat_intersection, header=None, names=repeat_columns
    )
    repeat_frames.append(
        # One row per flipon: all overlapping repeat names joined with ", ".
        # sort=False keeps flipons in order of first appearance in the file.
        repeat_hits.groupby("Coordinates", sort=False)["rep_name"]
        .agg(", ".join)
        .reset_index(name="LINE/LTR (+-200bp)")
        .assign(Flipon=flipon.name)[["Flipon", "Coordinates", "LINE/LTR (+-200bp)"]]
    )

flipon_to_repeat_df = pd.concat(repeat_frames, ignore_index=True)

flipon_to_repeat_df


Unnamed: 0,Flipon,Coordinates,LINE/LTR (+-200bp)
0,g4,chr1:3014794-3014871,"LINE@L1Md_F2(-), LINE@L1VL1(+)"
1,g4,chr1:3099888-3099963,"LINE@L1Md_F2(+), LTR@MTC(+)"
2,g4,chr1:3287445-3287468,LINE@L1MD(-)
3,g4,chr1:3472953-3472969,LINE@L1_Mus1(+)
4,g4,chr1:3535948-3535996,"LINE@L1_Mus2(-), LINE@L1VL1(+)"
...,...,...,...
39775,h-dna,chrY:4200098-4200122,LINE@L1MD(+)
39776,h-dna,chrY:4208594-4208610,"LTR@ORR1E(+), LINE@L1Md_T(-)"
39777,h-dna,chrY:4223501-4223522,LINE@L1_Mus2(-)
39778,h-dna,chrY:4231086-4231113,LTR@MuRRS4-int(-)


## Flipons to everything

In [9]:
def get_feature_group(x: list):
    """Name the annotation classes that overlap one flipon.

    Args:
        x: Two-element list-like (a plain list or a pandas Series row).
            Position 0 holds the cCRE label string, position 1 the LINE/LTR
            label string; an empty string means "no overlap".

    Returns:
        The applicable names among "cCRE", "LINE/LTR" and "CTCF", in that
        order, joined with " & ". An empty string when none apply.
    """
    # Read by position through list(): integer keys on a label-indexed Series
    # (which is what DataFrame.apply(axis=1) passes in) rely on pandas'
    # deprecated positional fallback.
    values = list(x)
    ccre_labels = values[0]
    repeat_labels = values[1]

    # Whatever is left after removing every CTCF marker and the separators
    # is a cCRE label that is not purely CTCF.
    non_ctcf_labels = (
        ccre_labels.replace(" (CTCF-bound)", "")
        .replace("CTCF-only", "")
        .replace("CTCF", "")
        .strip(", ")
    )

    components = []
    if non_ctcf_labels != "":
        components.append("cCRE")
    if repeat_labels != "":
        components.append("LINE/LTR")
    if "CTCF" in ccre_labels:
        components.append("CTCF")

    return " & ".join(components)


In [10]:
# The gene annotation is the backbone: miRNA, cCRE and repeat annotations are
# left-joined onto it, and flipons without a match get empty strings.
flipon_to_data = (
    flipon_to_gene_df.merge(
        flipon_to_mirna_df, on=["Flipon", "Coordinates"], how="left"
    )
    .merge(flipon_to_ccre_df, on=["Flipon", "Coordinates"], how="left")
    .merge(flipon_to_repeat_df, on=["Flipon", "Coordinates"], how="left")
    .fillna("")
)
# A left join only adds rows when a right-hand table repeats a key; fail loudly
# instead of silently duplicating flipons.
assert len(flipon_to_data) == len(flipon_to_gene_df), (
    "left merges changed the row count: duplicate (Flipon, Coordinates) keys "
    "in an annotation table"
)

# Strip the "@<identifier>" part of every label (e.g. "pELS@EM10E0431220" ->
# "pELS,") before classifying. Only the two columns that get_feature_group
# reads are rewritten, not the whole merged frame.
flipon_to_data["Feature Group"] = (
    flipon_to_data[["cCRE (+-200bp)", "LINE/LTR (+-200bp)"]]
    .replace(regex=r"(@(.+?), )|(@(.+?)$)", value=",")
    .apply(get_feature_group, axis=1)
)

flipon_to_data


Unnamed: 0,Flipon,Coordinates,Gene Feature,Gene Name,Gene Strand,Gene Type,Gene ID,Transcript ID,c. M miR (+),c. M miR (-),...,c. J miR (-),M miR (+),M miR (-),MJ miR (+),MJ miR (-),J miR (+),J miR (-),cCRE (+-200bp),LINE/LTR (+-200bp),Feature Group
0,g4,chr1:3014794-3014871,Distal Intergenic,4933401J01Rik,+,TEC,ENSMUSG00000102693.1,ENSMUST00000193812.1,miR-328 (1),,...,,"miR-6994 (1), miR-129b (1)","miR-92a-2 (1), miR-698/miR-7078 (1), miR-3572 ...",miR-486a/miR-486b (1),"miR-3076 (1), miR-6418 (1), miR-6987 (1)",,miR-7649 (1),,"LINE@L1Md_F2(-), LINE@L1VL1(+)",LINE/LTR
1,g4,chr1:3099888-3099963,Promoter (2-3kb),Gm26206,+,snRNA,ENSMUSG00000064842.1,ENSMUST00000082908.1,,miR-328 (1),...,,"miR-365-1/miR-365-2 (1), miR-698/miR-7078 (1),...","miR-221 (1), miR-129b (1)","miR-6418 (1), miR-7067 (1)",,,,,"LINE@L1Md_F2(+), LTR@MTC(+)",LINE/LTR
2,g4,chr1:3287445-3287468,Intron,Gm18956,+,processed_pseudogene,ENSMUSG00000102851.1,ENSMUST00000192857.1,,,...,,,"miR-6971 (2), miR-7030/miR-7075/miR-7076 (2)",,"miR-7058 (1), miR-7073 (1), miR-7074 (1), miR-...",,miR-7012/miR-7057 (1),,LINE@L1MD(-),LINE/LTR
3,g4,chr1:3472953-3472969,Intron,Gm37686,-,TEC,ENSMUSG00000103025.1,ENSMUST00000194099.1,,,...,,,"miR-6971 (1), miR-7030/miR-7075/miR-7076 (1)",,,,,,LINE@L1_Mus1(+),LINE/LTR
4,g4,chr1:3535948-3535996,Intron,Gm7341,+,processed_pseudogene,ENSMUSG00000103147.1,ENSMUST00000192183.1,miR-129 (1),,...,,miR-320 (1),"miR-3104 (1), miR-6986 (1), miR-6988 (2), miR-...",,"miR-698/miR-7078 (1), miR-6956 (1)",,,,"LINE@L1_Mus2(-), LINE@L1VL1(+)",LINE/LTR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77703,z-dna,chrY:3866251-3866287,Distal Intergenic,Gm8521,+,unprocessed_pseudogene,ENSMUSG00000099838.1,ENSMUST00000190394.1,,,...,,,"miR-466d/miR-466i (4), miR-466l (4)",,miR-669a/miR-669f/miR-669l/miR-669p (1),,miR-592 (1),,LTR@ORR1E(-),LINE/LTR
77704,z-dna,chrY:4195990-4196008,Distal Intergenic,Gm29038,-,unprocessed_pseudogene,ENSMUSG00000101108.1,ENSMUST00000191543.1,,,...,,"miR-7060 (1), miR-7658 (1)",,miR-337 (1),miR-378a (1),,,,"LINE@L1_Mus3(+), LTR@IAPLTR3(+)",LINE/LTR
77705,z-dna,chrY:4202845-4202862,Distal Intergenic,Gm28191,+,unprocessed_pseudogene,ENSMUSG00000100300.1,ENSMUST00000189112.1,,,...,,,,,miR-26a/miR-26b (1),miR-547 (1),,,"LTR@IAPEY3-int(-), LTR@IAPEY_LTR(-)",LINE/LTR
77706,z-dna,chrY:4203100-4203146,Distal Intergenic,Gm28191,+,unprocessed_pseudogene,ENSMUSG00000100300.1,ENSMUST00000189112.1,,,...,,"miR-466d/miR-466i (4), miR-466l (5)",,miR-493 (1),,,,,LINE@L1_Mur1(-),LINE/LTR


In [11]:
# Read every flipon BED file once and concatenate in a single step, rather
# than re-copying a growing DataFrame on every loop iteration.
flipon_beds = pd.concat(
    [
        pd.read_table(
            flipon.path_bed,
            sep="\t",
            # NOTE(review): the first line of each BED file is skipped -
            # presumably a header line; if the files have none, the first
            # interval is lost. Confirm against the input files.
            skiprows=1,
            header=None,
            names=["chr", "start", "end", "Coordinates", "score", "Strand"],
        ).assign(Flipon=flipon.name)
        for flipon in flipon_data
    ],
    ignore_index=True,
)

# Final column order of the exported table.
output_columns = [
    "Flipon",
    "Coordinates",
    "Strand",
    "Gene Feature",
    "Gene Name",
    "Gene Strand",
    "Gene Type",
    "Gene ID",
    "Transcript ID",
    "c. M miR (+)",
    "c. M miR (-)",
    "c. MJ miR (+)",
    "c. MJ miR (-)",
    "c. J miR (+)",
    "c. J miR (-)",
    "M miR (+)",
    "M miR (-)",
    "MJ miR (+)",
    "MJ miR (-)",
    "J miR (+)",
    "J miR (-)",
    "cCRE (+-200bp)",
    "LINE/LTR (+-200bp)",
    "Feature Group",
]

flipon_to_data = (
    flipon_to_data
    # Dropping a Strand column left by an earlier run keeps this cell
    # re-runnable; otherwise the merge yields Strand_x / Strand_y and the
    # column selection below raises KeyError.
    .drop(columns="Strand", errors="ignore")
    .merge(
        flipon_beds[["Flipon", "Coordinates", "Strand"]],
        on=["Flipon", "Coordinates"],
        how="left",
    )[output_columns]
)
flipon_to_data


Unnamed: 0,Flipon,Coordinates,Strand,Gene Feature,Gene Name,Gene Strand,Gene Type,Gene ID,Transcript ID,c. M miR (+),...,c. J miR (-),M miR (+),M miR (-),MJ miR (+),MJ miR (-),J miR (+),J miR (-),cCRE (+-200bp),LINE/LTR (+-200bp),Feature Group
0,g4,chr1:3014794-3014871,-,Distal Intergenic,4933401J01Rik,+,TEC,ENSMUSG00000102693.1,ENSMUST00000193812.1,miR-328 (1),...,,"miR-6994 (1), miR-129b (1)","miR-92a-2 (1), miR-698/miR-7078 (1), miR-3572 ...",miR-486a/miR-486b (1),"miR-3076 (1), miR-6418 (1), miR-6987 (1)",,miR-7649 (1),,"LINE@L1Md_F2(-), LINE@L1VL1(+)",LINE/LTR
1,g4,chr1:3099888-3099963,+,Promoter (2-3kb),Gm26206,+,snRNA,ENSMUSG00000064842.1,ENSMUST00000082908.1,,...,,"miR-365-1/miR-365-2 (1), miR-698/miR-7078 (1),...","miR-221 (1), miR-129b (1)","miR-6418 (1), miR-7067 (1)",,,,,"LINE@L1Md_F2(+), LTR@MTC(+)",LINE/LTR
2,g4,chr1:3287445-3287468,-,Intron,Gm18956,+,processed_pseudogene,ENSMUSG00000102851.1,ENSMUST00000192857.1,,...,,,"miR-6971 (2), miR-7030/miR-7075/miR-7076 (2)",,"miR-7058 (1), miR-7073 (1), miR-7074 (1), miR-...",,miR-7012/miR-7057 (1),,LINE@L1MD(-),LINE/LTR
3,g4,chr1:3472953-3472969,-,Intron,Gm37686,-,TEC,ENSMUSG00000103025.1,ENSMUST00000194099.1,,...,,,"miR-6971 (1), miR-7030/miR-7075/miR-7076 (1)",,,,,,LINE@L1_Mus1(+),LINE/LTR
4,g4,chr1:3535948-3535996,-,Intron,Gm7341,+,processed_pseudogene,ENSMUSG00000103147.1,ENSMUST00000192183.1,miR-129 (1),...,,miR-320 (1),"miR-3104 (1), miR-6986 (1), miR-6988 (2), miR-...",,"miR-698/miR-7078 (1), miR-6956 (1)",,,,"LINE@L1_Mus2(-), LINE@L1VL1(+)",LINE/LTR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77703,z-dna,chrY:3866251-3866287,.,Distal Intergenic,Gm8521,+,unprocessed_pseudogene,ENSMUSG00000099838.1,ENSMUST00000190394.1,,...,,,"miR-466d/miR-466i (4), miR-466l (4)",,miR-669a/miR-669f/miR-669l/miR-669p (1),,miR-592 (1),,LTR@ORR1E(-),LINE/LTR
77704,z-dna,chrY:4195990-4196008,.,Distal Intergenic,Gm29038,-,unprocessed_pseudogene,ENSMUSG00000101108.1,ENSMUST00000191543.1,,...,,"miR-7060 (1), miR-7658 (1)",,miR-337 (1),miR-378a (1),,,,"LINE@L1_Mus3(+), LTR@IAPLTR3(+)",LINE/LTR
77705,z-dna,chrY:4202845-4202862,.,Distal Intergenic,Gm28191,+,unprocessed_pseudogene,ENSMUSG00000100300.1,ENSMUST00000189112.1,,...,,,,,miR-26a/miR-26b (1),miR-547 (1),,,"LTR@IAPEY3-int(-), LTR@IAPEY_LTR(-)",LINE/LTR
77706,z-dna,chrY:4203100-4203146,.,Distal Intergenic,Gm28191,+,unprocessed_pseudogene,ENSMUSG00000100300.1,ENSMUST00000189112.1,,...,,"miR-466d/miR-466i (4), miR-466l (5)",,miR-493 (1),,,,,LINE@L1_Mur1(-),LINE/LTR


In [12]:
# Disabled export of the rows that have a conserved miR.
# NOTE(review): the column names in this block ("c. miR (+)", "c. miR (-)") do
# not exist in flipon_to_data, whose conserved-miR columns are named
# "c. M miR (+)", "c. MJ miR (+)", ...; update them before re-enabling.
# (
#     flipon_to_data
#     .replace("", None)
#     .dropna(subset=["c. miR (+)", "c. miR (-)"], how="all")
#     .fillna("")
#     .to_csv(F_FLIPON_TO_DATA_MIRNA, sep="\t", index=False)
# )

# Write the full annotation table as a tab-separated file without the index.
flipon_to_data.to_csv(
    F_FLIPON_TO_DATA,
    sep="\t",
    index=False,
)