## Misc

In [1]:
!python3 --version
!pip list | grep --color=auto bio

Python 3.10.6
[01;31m[Kbio[m[Kpython             1.79


In [2]:
import os
import subprocess
import numpy as np
import pandas as pd

from Bio import SeqIO
from dataclasses import dataclass
from tqdm.auto import tqdm, trange
from collections import Counter


In [11]:
d_root = "/home/fed/GitHub/article_conserved_miRNA/"
d_data = f"{d_root}/data"
d_utils = f"{d_root}/utils"

# Input
f_mirna_list = f"{d_data}/miRNA_list.tsv"
f_flipon_to_gene_anno = f"{d_data}/flipon_to_gene_anno.tsv"
f_sidd_to_gene = f"{d_data}/sidd_to_gene.tsv"
f_ccre = f"{d_data}/encodeCcreCombined.bed"
f_rmsk = f"{d_data}/mm10_rmsk.txt.gz"
f_rmsk_line_ltr = f"{d_data}/mm10_rmsk_line_ltr.bed"
f_h3k9ac = f"{d_data}/GSM775313_H3K9Ac_s_3_sorted_unique.uplift.bed.gz"
f_h3k14ac = f"{d_data}/GSM775314_H3K14Ac_13HH3_s_4_sorted.uplift.bed.gz"

# Output
f_ccre_small = f"{d_data}/ccre_small.bed"
f_flipon_to_data = f"{d_data}/flipon_to_data.tsv"
f_plotly_ccre_vs_repeats_3 = f"{d_root}/img/plotl_ccre_vs_repeats_3.html"


In [4]:
@dataclass
class Data:
    name: str
    path_fa: str
    path_bed: str
    path_bed_200: str
    path_int_ccre: str = None
    path_int_repeats: str = None


flipon_data = [
    Data(
        "sidd",
        f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_sidd.fa",
        f"{d_data}/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_sidd.bed",
        f"{d_data}/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_sidd.slop200.bed",
    ),
    Data(
        "z-dna",
        f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_z-dna.fa",
        f"{d_data}/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_z-dna.bed",
        f"{d_data}/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_z-dna.slop200.bed",
    ),
    Data(
        "quadruplex",
        f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_quadruplex.fa",
        f"{d_data}/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_quadruplex.bed",
        f"{d_data}/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_quadruplex.slop200.bed",
    ),
    Data(
        "h-dna",
        f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_h-dna.fa",
        f"{d_data}/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_h-dna.bed",
        f"{d_data}/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_h-dna.slop200.bed",
    ),
]


## Data

In [5]:
def reverse_complement(x: str):
    t_d = {"A": "T", "C": "G", "T": "A", "G": "C", "N": "N"}
    return "".join([t_d[y] for y in x[::-1]])


mirna_df = pd.read_table(f_mirna_list)

mirna_df["Seed region"] = mirna_df["Seed region"].str.replace("U", "T")
mirna_df["Seed region"] = mirna_df["Seed region"].str.split(", ")
mirna_df["Seed region (RC)"] = mirna_df["Seed region"].apply(
    lambda x: [reverse_complement(y) for y in x]
)

print(mirna_df.shape)
mirna_df.iloc[10:20, :]


(177, 5)


Unnamed: 0,Conservation,miRNA gene family,# of genes,Seed region,Seed region (RC)
10,Bilateria,miR-34/449,6,[GGCAGTG],[CACTGCC]
11,Bilateria,miR-96/1271,2,[TTGGCAC],[GTGCCAA]
12,Bilateria,miR-99/100,3,[ACCCGTA],[TACGGGT]
13,Bilateria,miR-124,3,"[AAGGCAC, TAAGGCA]","[GTGCCTT, TGCCTTA]"
14,Bilateria,miR-125,3,[CCCTGAG],[CTCAGGG]
15,Bilateria,miR-133,3,"[TGGTCCC, TTGGTCC]","[GGGACCA, GGACCAA]"
16,Bilateria,miR-153,2,[TGCATAG],[CTATGCA]
17,Bilateria,miR-183,1,"[ATGGCAC, TGGCACT]","[GTGCCAT, AGTGCCA]"
18,Bilateria,miR-184,1,[GGACGGA],[TCCGTCC]
19,Bilateria,miR-190,2,[GATATGT],[ACATATC]


## Map flipons to: miRNA

In [6]:
flipons_to_mirna_df = pd.DataFrame()

for flipon in tqdm(flipon_data):
    shape: int
    with open(flipon.path_bed, "r") as f_in:
        shape = sum(1 for _ in f_in)
    print(f"{flipon.name}: {shape:,d} regions")

    fasta = list(SeqIO.parse(flipon.path_fa, "fasta"))
    fasta_fwd_seq = [x.seq.upper() for x in fasta]

    mirna_fams = mirna_df["miRNA gene family"]
    mirna_motiffs = mirna_df["Seed region"]
    mirna_rcm_motiffs = mirna_df["Seed region (RC)"]

    mirna_list_fwd = [{} for _ in range(shape)]
    mirna_list_rcm = [{} for _ in range(shape)]

    for j, rec_fwd in enumerate(tqdm(fasta_fwd_seq, total=shape, leave=False)):
        counter_fwd = Counter()
        counter_rcm = Counter()
        for i, mirna_family in enumerate(mirna_fams):
            for motiff_fwd, motiff_rcm in zip(mirna_motiffs[i], mirna_rcm_motiffs[i]):
                counter_fwd[mirna_family] += rec_fwd.count(motiff_fwd)
                counter_rcm[mirna_family] += rec_fwd.count(motiff_rcm)
        mirna_list_fwd[j] = +counter_fwd
        mirna_list_rcm[j] = +counter_rcm

    df = pd.DataFrame(
        {
            "group": flipon.name,
            "coordinates": [x.id for x in fasta],
            "miRNA (+ strand)": mirna_list_fwd,
            "miRNA (- strand)": mirna_list_rcm,
        }
    )

    df["miRNA (intersection)"] = df[["miRNA (+ strand)", "miRNA (- strand)"]].apply(
        lambda x: ", ".join(sorted(set(x[0]) & set(x[1]))), axis=1
    )
    df["miRNA (+ strand)"] = df["miRNA (+ strand)"].apply(
        lambda x: ", ".join(
            sorted(
                [f"{y[0]} ({y[1]:,d})" if y[1] != 1 else f"{y[0]}" for y in x.items()]
            )
        )
    )
    df["miRNA (- strand)"] = df["miRNA (- strand)"].apply(
        lambda x: ", ".join(
            sorted(
                [f"{y[0]} ({y[1]:,d})" if y[1] != 1 else f"{y[0]}" for y in x.items()]
            )
        )
    )

    flipons_to_mirna_df = pd.concat(
        [flipons_to_mirna_df, df.sort_values("coordinates")], ignore_index=True
    )

# flipons_to_mirna_df.to_csv('flipons_to_mirna.tsv', sep='\t', quoting=2)
flipons_to_mirna_df


  0%|          | 0/4 [00:00<?, ?it/s]

sidd: 15,294 regions


  0%|          | 0/15294 [00:00<?, ?it/s]

z-dna: 25,057 regions


  0%|          | 0/25057 [00:00<?, ?it/s]

quadruplex: 20,251 regions


  0%|          | 0/20251 [00:00<?, ?it/s]

h-dna: 17,098 regions


  0%|          | 0/17098 [00:00<?, ?it/s]

Unnamed: 0,group,coordinates,miRNA (+ strand),miRNA (- strand),miRNA (intersection)
0,sidd,chr10:100119606-100119743,"miR-539, miR-670","miR-143, miR-186, miR-30",
1,sidd,chr10:100146768-100146986,"miR-155, miR-374",miR-448,
2,sidd,chr10:100160840-100160994,"miR-188, miR-204/211","miR-203a, miR-653",
3,sidd,chr10:100192250-100192378,"miR-15/16/195/424/497, miR-203a (2)",,
4,sidd,chr10:100401845-100402004,,"miR-101, miR-140",
...,...,...,...,...,...
77695,h-dna,chrY:4200098-4200122,,,
77696,h-dna,chrY:4208594-4208610,,,
77697,h-dna,chrY:4223501-4223522,,,
77698,h-dna,chrY:4231086-4231113,,,


## Map flipons to: gene features

In [13]:
def remove_gene_ids_from_annotation(s: str):
    if "Intron" in s or "Exon" in s:
        return s.split(" ")[0].strip()
    return s


flipons_to_genes_df = (
    pd.read_table(f_flipon_to_gene_anno)
    .assign(
        coordinates=lambda x: x["seqnames"]
        + ":"
        + (x["start"] - 1).astype(str)
        + "-"
        + x["end"].astype(str),
        geneStrand=lambda x: x["geneStrand"].apply(lambda y: "+" if y == 1 else "-"),
        gene_info=lambda x: x["gene_name"] + " (" + x["geneStrand"] + ")",
        annotation=lambda x: x["annotation"].apply(remove_gene_ids_from_annotation),
    )
    .loc[:, ["group", "coordinates", "annotation", "gene_info"]]
)

flipons_to_genes_df


Unnamed: 0,group,coordinates,annotation,gene_info
0,sidd,chr7:142578468-142578571,Promoter (<=1kb),H19 (-)
1,sidd,chrX:161216708-161216853,Intron,Scml2 (+)
2,sidd,chr11:95573504-95573647,Intron,Ngfr (-)
3,sidd,chr11:59307372-59307527,Promoter (<=1kb),Wnt9a (+)
4,sidd,chr17:63792985-63793092,Distal Intergenic,Fer (+)
...,...,...,...,...
39242,h-dna,chr13:21359449-21359470,Distal Intergenic,BX001068.1 (+)
39243,h-dna,chr4:96049628-96049650,Intron,AL772212.1 (-)
39244,h-dna,chr7:27899641-27899686,Distal Intergenic,AC139063.1 (-)
39245,h-dna,chr17:39848378-39848395,Promoter (<=1kb),CT010467.2 (-)


## Map flipons to: cCRE

In [14]:
pd.read_table(f_ccre, header=None)[9].value_counts()


dELS                        182982
pELS                         57202
dELS,CTCF-bound              28203
CTCF-only,CTCF-bound         24072
pELS,CTCF-bound              16620
PLS                          14062
PLS,CTCF-bound               10052
DNase-H3K4me3                 7095
DNase-H3K4me3,CTCF-bound      3443
Name: 9, dtype: int64

In [15]:
ccre_small_df = (
    pd.read_table(f_ccre, header=None)
    .assign(
        name_preformatted=lambda x: x[9] + "@" + x[3],
        name=lambda x: x["name_preformatted"].str.replace(
            ",CTCF-bound", " (CTCF-bound)"
        ),
    )
    .loc[:, [0, 1, 2, "name"]]
)

ccre_small_df.to_csv(f_ccre_small, sep="\t", header=False, index=False)
ccre_small_df


Unnamed: 0,0,1,2,name
0,chr1,3119617,3119911,dELS@EM10E0431203
1,chr1,3119914,3120120,dELS@EM10E0431204
2,chr1,3120346,3120662,dELS@EM10E0431205
3,chr1,3292622,3292971,dELS@EM10E0431207
4,chr1,3322453,3322797,dELS@EM10E0431208
...,...,...,...,...
343726,chrY,90729231,90729435,CTCF-only (CTCF-bound)@EM10E0932204
343727,chrY,90732178,90732526,CTCF-only (CTCF-bound)@EM10E0932207
343728,chrY,90734379,90734726,CTCF-only (CTCF-bound)@EM10E0932208
343729,chrY,90744476,90744639,DNase-H3K4me3 (CTCF-bound)@EM10E0932222


In [16]:
for flipon in flipon_data:
    flipon.path_int_ccre = flipon.path_bed_200.replace('.bed', '_and_ccre.bed')
    !bedtools intersect -a {flipon.path_bed_200} -b {f_ccre_small} -wo > {flipon.path_int_ccre}
    !wc -l {flipon.path_int_ccre}

1683 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_sidd.slop200_and_ccre.bed
20638 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_z-dna.slop200_and_ccre.bed
14637 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_quadruplex.slop200_and_ccre.bed
3756 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_h-dna.slop200_and_ccre.bed


In [17]:
flipons_to_ccre_df = pd.DataFrame()
columns = [
    "sidd_chr",
    "sidd_start",
    "sidd_end",
    "coordinates",
    "ccre_chr",
    "ccre_start",
    "ccre_end",
    "ccre",
    "int_len",
]

for flipon in flipon_data:
    df = (
        pd.read_table(flipon.path_int_ccre, header=None, names=columns)
        .assign(
            group=flipon.name,
            ccre_join=lambda x: x.groupby("coordinates")["ccre"].transform(
                lambda x: ", ".join(x)
            ),
        )
        .drop_duplicates(subset=["coordinates"])
        .loc[:, ["group", "coordinates", "ccre_join"]]
        .rename(columns={"ccre_join": "cCRE (+-200bp slop)"})
    )

    flipons_to_ccre_df = pd.concat([flipons_to_ccre_df, df], ignore_index=True)

flipons_to_ccre_df


Unnamed: 0,group,coordinates,cCRE (+-200bp slop)
0,sidd,chr1:11330535-11330712,"dELS@EM10E0431890, dELS@EM10E0431891"
1,sidd,chr1:16171838-16172011,dELS@EM10E0432450
2,sidd,chr1:20743638-20743760,DNase-H3K4me3@EM10E0432741
3,sidd,chr1:22532979-22533089,dELS (CTCF-bound)@EM10E0432966
4,sidd,chr1:23924439-23924546,"pELS@EM10E0433139, dELS@EM10E0433140"
...,...,...,...
24415,h-dna,chrX:152769666-152769687,pELS (CTCF-bound)@EM10E0930778
24416,h-dna,chrX:159987892-159987912,DNase-H3K4me3 (CTCF-bound)@EM10E0931180
24417,h-dna,chrX:161717977-161717993,"PLS@EM10E0931282, pELS@EM10E0931283"
24418,h-dna,chrX:162643118-162643148,"pELS@EM10E0931392, pELS@EM10E0931393, pELS (CT..."


## Map flipons to: repeats

In [18]:
columns = [
    "bin",
    "swScore",
    "milliDiv",
    "milliDel",
    "milliIns",
    "genoName",
    "genoStart",
    "genoEnd",
    "genoLeft",
    "strand",
    "repName",
    "repClass",
    "repFamily",
    "repStart",
    "repEnd",
    "repLeft",
    "id",
]
mm10_rmsk = (
    pd.read_table(f_rmsk, header=None, names=columns, compression="gzip")
    .query('repClass.str.contains("LTR") or repClass.str.contains("LINE")')
    .assign(
        name=lambda x: x["repClass"] + "@" + x["repName"] + " (" + x["strand"] + ")"
    )
    .loc[:, ["genoName", "genoStart", "genoEnd", "name", "swScore", "strand"]]
    .reset_index(drop=True)
)

mm10_rmsk.to_csv(f_rmsk_line_ltr, sep="\t", header=None, index=None)
display(mm10_rmsk)


Unnamed: 0,genoName,genoStart,genoEnd,name,swScore,strand
0,chr1,3000000,3002128,LINE@L1_Mus3 (-),12955,-
1,chr1,3003152,3003994,LINE@L1Md_F (-),1216,-
2,chr1,3003993,3004054,LINE@L1_Mus3 (-),234,-
3,chr1,3004040,3004206,LINE@L1_Rod (+),3685,+
4,chr1,3004270,3005001,LINE@L1_Rod (+),3685,+
...,...,...,...,...,...,...
2023577,chrna_GL456050_alt,139992,140094,LTR@RMER19B (-),566,-
2023578,chrna_GL456050_alt,140098,140367,LTR@RMER16-int (+),8702,+
2023579,chrna_GL456050_alt,140409,140631,LTR@RMER16-int (+),8702,+
2023580,chrna_GL456050_alt,140631,141216,LTR@LTRIS_Mus (-),5082,-


In [19]:
for flipon in flipon_data:
    flipon.path_int_repeats = flipon.path_bed_200.replace('.bed', '_and_repeats.bed')
    !bedtools intersect -a {flipon.path_bed_200} -b {f_rmsk_line_ltr} -wo > {flipon.path_int_repeats}
    !wc -l {flipon.path_int_repeats}

23473 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_sidd.slop200_and_repeats.bed
11832 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_z-dna.slop200_and_repeats.bed
13332 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_quadruplex.slop200_and_repeats.bed
15869 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_h-dna.slop200_and_repeats.bed


In [20]:
flipons_to_repeats_df = pd.DataFrame()
columns = [
    "sidd_chr",
    "sidd_start",
    "sidd_end",
    "coordinates",
    "rep_chr",
    "rep_start",
    "rep_end",
    "rep_name",
    "rep_score",
    "rep_strand",
    "int_len",
]

for flipon in flipon_data:
    df = (
        pd.read_table(flipon.path_int_repeats, header=None, names=columns)
        .assign(
            group=flipon.name,
            repeats_join=lambda x: x.groupby("coordinates")["rep_name"].transform(
                lambda x: ", ".join(x)
            ),
        )
        .drop_duplicates(subset=["coordinates"])
        .loc[:, ["group", "coordinates", "repeats_join"]]
        .rename(columns={"repeats_join": "LINE/LTR repeats (+-200bp slop)"})
    )

    flipons_to_repeats_df = pd.concat([flipons_to_repeats_df, df], ignore_index=True)

flipons_to_repeats_df


Unnamed: 0,group,coordinates,LINE/LTR repeats (+-200bp slop)
0,sidd,chr1:3235931-3236054,"LINE@L1Md_F2 (+), LINE@L1Md_F2 (+)"
1,sidd,chr1:3350891-3351054,"LINE@L1Md_T (+), LINE@Lx3B (-)"
2,sidd,chr1:3473131-3473291,LINE@L1_Mus1 (+)
3,sidd,chr1:3490742-3490885,"LINE@Lx3B (-), LINE@Lx2A1 (-)"
4,sidd,chr1:3566652-3566892,"LINE@Lx7 (+), LTR@RMER17A (+)"
...,...,...,...
39768,h-dna,chrY:4223501-4223522,LINE@L1_Mus2 (-)
39769,h-dna,chrY:4208594-4208610,"LTR@ORR1E (+), LINE@L1Md_T (-)"
39770,h-dna,chrY:4200098-4200122,LINE@L1MD (+)
39771,h-dna,chrY:4185636-4185659,"LTR@MMERGLN-int (-), LINE@L1Md_F2 (-)"


## Flipon mapping info: miRNA, gene features, cCREs, LINE/LTR repeats

In [21]:
flipons_to_data = (
    flipons_to_mirna_df.merge(
        flipons_to_genes_df, on=["group", "coordinates"], how="left"
    )
    .merge(flipons_to_ccre_df, on=["group", "coordinates"], how="left")
    .merge(flipons_to_repeats_df, on=["group", "coordinates"], how="left")
    .replace("", np.nan)
    .dropna(subset=["miRNA (+ strand)", "miRNA (- strand)"], how="all")
    .fillna("")
    .reset_index(drop=True)
)

flipons_to_data.to_csv(f_flipon_to_data, sep="\t", index=False)
flipons_to_data


Unnamed: 0,group,coordinates,miRNA (+ strand),miRNA (- strand),miRNA (intersection),annotation,gene_info,cCRE (+-200bp slop),LINE/LTR repeats (+-200bp slop)
0,sidd,chr10:100119606-100119743,"miR-539, miR-670","miR-143, miR-186, miR-30",,Distal Intergenic,Gm22918 (+),,"LTR@RLTR10C (+), LINE@Lx8b (-), LTR@MTB (+)"
1,sidd,chr10:100146768-100146986,"miR-155, miR-374",miR-448,,Promoter (2-3kb),Gm25287 (+),,LTR@MTB_Mm (-)
2,sidd,chr10:100160840-100160994,"miR-188, miR-204/211","miR-203a, miR-653",,,,dELS@EM10E0487086,"LINE@L1_Mus3 (-), LINE@L1_Mus3 (-)"
3,sidd,chr10:100192250-100192378,"miR-15/16/195/424/497, miR-203a (2)",,,Distal Intergenic,Gm47627 (-),,LINE@L1_Mus3 (+)
4,sidd,chr10:100401845-100402004,,"miR-101, miR-140",,Distal Intergenic,Gm4781 (-),,LINE@L1_Mus3 (-)
...,...,...,...,...,...,...,...,...,...
31681,h-dna,chrY:3773406-3773483,miR-339 (2),,,Promoter (1-2kb),Gm3376 (+),,
31682,h-dna,chrY:3773610-3773649,miR-339,,,,,,
31683,h-dna,chrY:3773694-3773741,,miR-185,,,,,
31684,h-dna,chrY:3865190-3865212,miR-122,,,,,,


## Task 1

The counts for the other flipons that are associated with the CCRE positive promoters (with no repeats) and those with the repeat promoters (and noCCRE). Is there any difference?


In [7]:
def create_dataframe(mirna_count: np.array, name: str):
    """Transform count matrix into a dataframe."""

    d = make_d(mirna_count, name)
    cur_df = pd.DataFrame([d], index=[name])
    cur_df["total"] = cur_df.sum(axis=1)
    return cur_df


def make_d(mirna_count: np.array, name: str):
    """Calculate values from a given count matrix to compute defined miRNA groupings."""
    temp_d = dict()
    df_count_temp = pd.DataFrame(mirna_count)
    overall_sum = (mirna_count).sum(axis=0)

    temp_d["no_mirna"] = (overall_sum == 0).sum()
    temp_d["single_mirna"] = (overall_sum == 1).sum()
    temp_d["single_but_multiple_times"] = (
        (df_count_temp.nunique() == 2) * (overall_sum > 1)
    ).sum()
    temp_d["multiple_types"] = (df_count_temp.nunique() > 2).sum()

    return temp_d


In [10]:
df = pd.DataFrame()

paths = [
    f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_sidd.fa",
    f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_sidd.slop100.fa",
    f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_sidd.slop200.fa",
    f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_sidd.slop500.fa",
    f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_z-dna.fa",
    f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_z-dna_and_sidd.fa",
    f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_z-dna_and_sidd.slop100.fa",
    f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_z-dna_and_sidd.slop200.fa",
    f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_z-dna_and_sidd.slop500.fa",
    f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_quadruplex.fa",
    f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_quadruplex_and_sidd.fa",
    f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_quadruplex_and_sidd.slop100.fa",
    f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_quadruplex_and_sidd.slop200.fa",
    f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_quadruplex_and_sidd.slop500.fa",
    f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_h-dna.fa",
    f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_h-dna_and_sidd.fa",
    f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_h-dna_and_sidd.slop100.fa",
    f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_h-dna_and_sidd.slop200.fa",
    f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_h-dna_and_sidd.slop500.fa",
]

for path in tqdm(paths):
    name = path.split('enriched_')[1].split('.fa')[0]
    # print(name, path)

    shape: int
    with open(path, "r") as f_in:
        shape = sum(1 for _ in f_in) // 2
    # print(f"{name}: {shape:,d} regions")

    fasta = list(SeqIO.parse(path, "fasta"))        
    fasta_fwd_seq = [x.seq.upper() for x in fasta]


    mirna_fams = mirna_df["miRNA gene family"]
    mirna_motiffs = mirna_df["Seed region"]
    mirna_rcm_motiffs = mirna_df["Seed region (RC)"]

    mirna_count_fwd = np.zeros((len(mirna_fams), shape))
    mirna_count_rcm = np.zeros((len(mirna_fams), shape))

    for j, rec_fwd in enumerate(tqdm(fasta_fwd_seq, total=shape, leave=False)):
        for i, _ in enumerate(mirna_fams):
            mirna_count_fwd[i][j] = sum(
                rec_fwd.count(motiff) for motiff in mirna_motiffs[i]
            )
            mirna_count_rcm[i][j] = sum(
                rec_fwd.count(motiff_rev_comp)
                for motiff_rev_comp in mirna_rcm_motiffs[i]
            )

    df = pd.concat(
        [
            df,
            pd.concat(
                [
                    create_dataframe(mirna_count_fwd, name + f"_F"),
                    create_dataframe(mirna_count_rcm, name + f"_RC"),
                ]
            ),
        ]
    )

df


  0%|          | 0/19 [00:00<?, ?it/s]

sidd /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_sidd.fa
sidd: 15,294 regions


  0%|          | 0/15294 [00:00<?, ?it/s]

sidd.slop100 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_sidd.slop100.fa
sidd.slop100: 15,294 regions


  0%|          | 0/15294 [00:00<?, ?it/s]

sidd.slop200 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_sidd.slop200.fa
sidd.slop200: 15,294 regions


  0%|          | 0/15294 [00:00<?, ?it/s]

sidd.slop500 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_sidd.slop500.fa
sidd.slop500: 15,294 regions


  0%|          | 0/15294 [00:00<?, ?it/s]

z-dna /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_z-dna.fa
z-dna: 25,057 regions


  0%|          | 0/25057 [00:00<?, ?it/s]

z-dna_and_sidd /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_z-dna_and_sidd.fa
z-dna_and_sidd: 256 regions


  0%|          | 0/256 [00:00<?, ?it/s]

z-dna_and_sidd.slop100 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_z-dna_and_sidd.slop100.fa
z-dna_and_sidd.slop100: 757 regions


  0%|          | 0/757 [00:00<?, ?it/s]

z-dna_and_sidd.slop200 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_z-dna_and_sidd.slop200.fa
z-dna_and_sidd.slop200: 1,116 regions


  0%|          | 0/1116 [00:00<?, ?it/s]

z-dna_and_sidd.slop500 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_z-dna_and_sidd.slop500.fa
z-dna_and_sidd.slop500: 1,801 regions


  0%|          | 0/1801 [00:00<?, ?it/s]

quadruplex /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_quadruplex.fa
quadruplex: 20,251 regions


  0%|          | 0/20251 [00:00<?, ?it/s]

quadruplex_and_sidd /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_quadruplex_and_sidd.fa
quadruplex_and_sidd: 1,374 regions


  0%|          | 0/1374 [00:00<?, ?it/s]

quadruplex_and_sidd.slop100 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_quadruplex_and_sidd.slop100.fa
quadruplex_and_sidd.slop100: 2,250 regions


  0%|          | 0/2250 [00:00<?, ?it/s]

quadruplex_and_sidd.slop200 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_quadruplex_and_sidd.slop200.fa
quadruplex_and_sidd.slop200: 2,534 regions


  0%|          | 0/2534 [00:00<?, ?it/s]

quadruplex_and_sidd.slop500 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_quadruplex_and_sidd.slop500.fa
quadruplex_and_sidd.slop500: 3,089 regions


  0%|          | 0/3089 [00:00<?, ?it/s]

h-dna /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_h-dna.fa
h-dna: 17,098 regions


  0%|          | 0/17098 [00:00<?, ?it/s]

h-dna_and_sidd /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_h-dna_and_sidd.fa
h-dna_and_sidd: 1,627 regions


  0%|          | 0/1627 [00:00<?, ?it/s]

h-dna_and_sidd.slop100 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_h-dna_and_sidd.slop100.fa
h-dna_and_sidd.slop100: 2,016 regions


  0%|          | 0/2016 [00:00<?, ?it/s]

h-dna_and_sidd.slop200 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_h-dna_and_sidd.slop200.fa
h-dna_and_sidd.slop200: 2,249 regions


  0%|          | 0/2249 [00:00<?, ?it/s]

h-dna_and_sidd.slop500 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_h-dna_and_sidd.slop500.fa
h-dna_and_sidd.slop500: 2,560 regions


  0%|          | 0/2560 [00:00<?, ?it/s]

Unnamed: 0,no_mirna,single_mirna,single_but_multiple_times,multiple_types,total
sidd_F,3153,4117,6271,1753,15294
sidd_RC,3048,4158,6273,1815,15294
sidd.slop100_F,135,520,10352,4287,15294
sidd.slop100_RC,130,557,10246,4361,15294
sidd.slop200_F,10,35,8760,6489,15294
sidd.slop200_RC,6,29,8705,6554,15294
sidd.slop500_F,1,0,2976,12317,15294
sidd.slop500_RC,0,0,2896,12398,15294
z-dna_F,21666,2616,730,45,25057
z-dna_RC,21709,2731,511,106,25057


## Task 2

The  overlap with DNA methylation at CCRE positive elements (but no repeats)  at all genomic locations and the same for repeats (with no CCRE overlap). Are repeat promoters preferentially methylated?


### Map flipons to: H3K9ac and H3K14ac

In [12]:
for flipon in flipon_data:
    !gzip -cd {f_h3k9ac} | bedtools intersect -a {flipon.path_bed} -b stdin -wo | head
    !gzip -cd {f_h3k14ac} | bedtools intersect -a {flipon.path_bed} -b stdin -wo | head
    break

## Task 3

Are the miRNAs that bind repeat promoters (no overlap with CCRE) the same or different from the CCRE promoters (no overlap with repeats)?

1. Pick all **promoter regions** with **repeats** but **no cCREs** and make a subset of miRNAs
2. Pick all **promoter regions** with **cCREs** but **no repeats** and make a subset of miRNAs
3. Make a barplot

In [161]:
form_df_1 = (
    flipons_to_data.query(
        'annotation.str.contains("Promoter") and \
        `cCRE (+-200bp slop)` == "" and \
        `LINE/LTR repeats (+-200bp slop)` != ""'
    )
    .replace(regex=" \(\d+\)", value="")
    .assign(
        type="Repeats without cCREs",
        mirna_p=lambda x: x["miRNA (+ strand)"].apply(lambda x: x.split(", ")),
        mirna_m=lambda x: x["miRNA (- strand)"].apply(lambda x: x.split(", ")),
    )
    .explode("mirna_p")
    .explode("mirna_m")
    .loc[:, ["type", "annotation", "group", "mirna_p", "mirna_m"]]
)

form_df_2 = (
    flipons_to_data.query(
        'annotation.str.contains("Promoter") and \
        `cCRE (+-200bp slop)` != "" and \
        `LINE/LTR repeats (+-200bp slop)` == ""'
    )
    .replace(regex=" \(\d+\)", value="")
    .assign(
        type="cCREs without Repeats",
        mirna_p=lambda x: x["miRNA (+ strand)"].apply(lambda x: x.split(", ")),
        mirna_m=lambda x: x["miRNA (- strand)"].apply(lambda x: x.split(", ")),
    )
    .explode("mirna_p")
    .explode("mirna_m")
    .loc[:, ["type", "annotation", "group", "mirna_p", "mirna_m"]]
)

form_df = (
    pd.concat([form_df_1, form_df_2], ignore_index=True)
    .melt(
        id_vars=["type", "annotation", "group"],
        value_vars=["mirna_p", "mirna_m"],
        var_name='strand',
        value_name="mirna",
    )
    .groupby(["mirna", "type"])["group"]
    .value_counts()
    .to_frame()
    .rename(columns={"group": "total_count"})
    .reset_index()
    .replace('', np.nan).dropna()
    .sort_values('total_count', ascending=False)
)
form_df


Unnamed: 0,mirna,type,group,total_count
238,miR-203a,Repeats without cCREs,sidd,637
575,miR-744,cCREs without Repeats,quadruplex,315
198,miR-186,Repeats without cCREs,sidd,226
398,miR-374,Repeats without cCREs,sidd,199
566,miR-670,Repeats without cCREs,sidd,158
...,...,...,...,...
128,miR-145,cCREs without Repeats,h-dna,1
471,miR-485,Repeats without cCREs,quadruplex,1
133,miR-147,Repeats without cCREs,sidd,1
249,miR-205,cCREs without Repeats,h-dna,1


In [159]:
import plotly.express as px

fig = px.bar(
    form_df,
    x="mirna",
    y="total_count",
    color="type",
    barmode="group",
    facet_row="group",
    # facet_col="value",
    # category_orders={"day": ["Thur", "Fri", "Sat", "Sun"], "time": ["Lunch", "Dinner"]},
    # color_discrete_map={'strand_p':'tomato', 'strand_m':'royalblue'},
)
fig.update_layout(
    title="miRNAs that bind <b>repeat promoters (no overlap with CCRE)</b> vs <b>the CCRE promoters (no overlap with repeats)</b>",
    height=800,
    width=1400,
)
fig.write_html(f_plotly_ccre_vs_repeats_3)
fig.show()
