## Misc

In [1]:
!python3 --version
!python3 -m pip list | grep -e "bio"

Python 3.11.0
biopython                     1.79


In [1]:
import os
import subprocess
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as pgo
import plotly.subplots as ps

from Bio import SeqIO
from dataclasses import dataclass
from tqdm.notebook import tqdm
from collections import Counter


In [2]:
d_root = "/home/fpavlov/projects/article_conserved_miRNA"
d_data = f"{d_root}/data"
d_utils = f"{d_root}/utils"

# Input
f_mirna_list = f"{d_data}/miRNA_list.tsv"
f_flipon_to_gene_anno = f"{d_data}/flipon_to_gene_anno.tsv"
f_sidd_to_gene = f"{d_data}/sidd_to_gene.tsv"
f_ccre = f"{d_data}/encodeCcreCombined.bed"
f_rmsk = f"{d_data}/mm10.rmsk.txt.gz"
f_rmsk_line_ltr = f"{d_data}/mm10_rmsk_line_ltr.bed"
f_h3k9ac = f"{d_data}/mm10.H3K9ac.uplift.bed"
f_h3k14ac = f"{d_data}/mm10.H3K14ac.uplift.bed"
f_mm10_fa = f'{d_data}/mm10.fa'

# Output
d_flipons_and_histones = f"{d_data}/flipons_and_histones"
f_ccre_small = f"{d_data}/ccre_small.bed"
f_flipon_to_data = f"{d_data}/flipon_to_data.tsv"
f_flipon_to_data_og = f"{d_data}/flipon_to_data_og.tsv" 
f_plotly_ccre_vs_repeats_3 = f"{d_root}/img/plotl_ccre_vs_repeats_3.html"
f_plotly_ccre_vs_repeats_4 = f"{d_root}/img/plotl_ccre_vs_repeats_4.html"
f_enrichment_table_genes =f"{d_data}/enrichment_table_genes.tsv"

!mkdir -p {d_flipons_and_histones}


In [3]:
@dataclass
class Data:
    name: str
    path_fa: str
    path_bed: str
    path_bed_200: str
    path_int_ccre: str = None
    path_int_repeats: str = None


flipon_data = [
    Data(
        "sidd",
        f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_sidd.fa",
        f"{d_data}/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_sidd.bed",
        f"{d_data}/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_sidd.slop200.bed",
    ),
    Data(
        "z-dna",
        f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_z-dna.fa",
        f"{d_data}/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_z-dna.bed",
        f"{d_data}/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_z-dna.slop200.bed",
    ),
    Data(
        "quadruplex",
        f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_quadruplex.fa",
        f"{d_data}/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_quadruplex.bed",
        f"{d_data}/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_quadruplex.slop200.bed",
    ),
    Data(
        "h-dna",
        f"{d_data}/mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_h-dna.fa",
        f"{d_data}/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_h-dna.bed",
        f"{d_data}/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_h-dna.slop200.bed",
    ),
]


## Data

In [4]:
def reverse_complement(x: str):
    t_d = {"A": "T", "C": "G", "T": "A", "G": "C", "N": "N"}
    return "".join([t_d[y] for y in x[::-1]])


mirna_df = pd.read_table(f_mirna_list)

mirna_df["Seed region"] = mirna_df["Seed region"].str.replace("U", "T")
mirna_df["Seed region"] = mirna_df["Seed region"].str.split(", ")
mirna_df["Seed region (RC)"] = mirna_df["Seed region"].apply(
    lambda x: [reverse_complement(y) for y in x]
)

print(mirna_df.shape)
mirna_df.iloc[10:20, :]


(177, 5)


Unnamed: 0,Conservation,miRNA gene family,# of genes,Seed region,Seed region (RC)
10,Bilateria,miR-34/449,6,[GGCAGTG],[CACTGCC]
11,Bilateria,miR-96/1271,2,[TTGGCAC],[GTGCCAA]
12,Bilateria,miR-99/100,3,[ACCCGTA],[TACGGGT]
13,Bilateria,miR-124,3,"[AAGGCAC, TAAGGCA]","[GTGCCTT, TGCCTTA]"
14,Bilateria,miR-125,3,[CCCTGAG],[CTCAGGG]
15,Bilateria,miR-133,3,"[TGGTCCC, TTGGTCC]","[GGGACCA, GGACCAA]"
16,Bilateria,miR-153,2,[TGCATAG],[CTATGCA]
17,Bilateria,miR-183,1,"[ATGGCAC, TGGCACT]","[GTGCCAT, AGTGCCA]"
18,Bilateria,miR-184,1,[GGACGGA],[TCCGTCC]
19,Bilateria,miR-190,2,[GATATGT],[ACATATC]


## Map flipons to: miRNA

In [5]:
flipons_to_mirna_df = pd.DataFrame()

for flipon in tqdm(flipon_data):
    shape: int
    with open(flipon.path_bed, "r") as f_in:
        shape = sum(1 for _ in f_in)
    print(f"{flipon.name}: {shape:,d} regions")

    fasta = list(SeqIO.parse(flipon.path_fa, "fasta"))
    fasta_fwd_seq = [x.seq.upper() for x in fasta]

    mirna_fams = mirna_df["miRNA gene family"]
    mirna_motiffs = mirna_df["Seed region"]
    mirna_rcm_motiffs = mirna_df["Seed region (RC)"]

    mirna_list_fwd = [{} for _ in range(shape)]
    mirna_list_rcm = [{} for _ in range(shape)]

    for j, rec_fwd in enumerate(tqdm(fasta_fwd_seq, total=shape, leave=False)):
        counter_fwd = Counter()
        counter_rcm = Counter()
        for i, mirna_family in enumerate(mirna_fams):
            for motiff_fwd, motiff_rcm in zip(mirna_motiffs[i], mirna_rcm_motiffs[i]):
                counter_fwd[mirna_family] += rec_fwd.count(motiff_fwd)
                counter_rcm[mirna_family] += rec_fwd.count(motiff_rcm)
        mirna_list_fwd[j] = +counter_fwd
        mirna_list_rcm[j] = +counter_rcm

    df = pd.DataFrame(
        {
            "group": flipon.name,
            "coordinates": [x.id for x in fasta],
            "miRNA (+ strand)": mirna_list_fwd,
            "miRNA (- strand)": mirna_list_rcm,
        }
    )

    df["miRNA (intersection)"] = df[["miRNA (+ strand)", "miRNA (- strand)"]].apply(
        lambda x: ", ".join(sorted(set(x[0]) & set(x[1]))), axis=1
    )
    df["miRNA (+ strand)"] = df["miRNA (+ strand)"].apply(
        lambda x: ", ".join(
            sorted(
                [f"{y[0]} ({y[1]:,d})" if y[1] != 1 else f"{y[0]}" for y in x.items()]
            )
        )
    )
    df["miRNA (- strand)"] = df["miRNA (- strand)"].apply(
        lambda x: ", ".join(
            sorted(
                [f"{y[0]} ({y[1]:,d})" if y[1] != 1 else f"{y[0]}" for y in x.items()]
            )
        )
    )

    flipons_to_mirna_df = pd.concat(
        [flipons_to_mirna_df, df.sort_values("coordinates")], ignore_index=True
    )

# flipons_to_mirna_df.to_csv('flipons_to_mirna.tsv', sep='\t', quoting=2)
flipons_to_mirna_df


  0%|          | 0/4 [00:00<?, ?it/s]

sidd: 15,294 regions


  0%|          | 0/15294 [00:00<?, ?it/s]

z-dna: 25,057 regions


  0%|          | 0/25057 [00:00<?, ?it/s]

quadruplex: 20,251 regions


  0%|          | 0/20251 [00:00<?, ?it/s]

h-dna: 17,098 regions


  0%|          | 0/17098 [00:00<?, ?it/s]

Unnamed: 0,group,coordinates,miRNA (+ strand),miRNA (- strand),miRNA (intersection)
0,sidd,chr10:100119606-100119743,"miR-539, miR-670","miR-143, miR-186, miR-30",
1,sidd,chr10:100146768-100146986,"miR-155, miR-374",miR-448,
2,sidd,chr10:100160840-100160994,"miR-188, miR-204/211","miR-203a, miR-653",
3,sidd,chr10:100192250-100192378,"miR-15/16/195/424/497, miR-203a (2)",,
4,sidd,chr10:100401845-100402004,,"miR-101, miR-140",
...,...,...,...,...,...
77695,h-dna,chrY:4200098-4200122,,,
77696,h-dna,chrY:4208594-4208610,,,
77697,h-dna,chrY:4223501-4223522,,,
77698,h-dna,chrY:4231086-4231113,,,


## Map flipons to: gene features

In [6]:
def remove_gene_ids_from_annotation(s: str):
    if "Intron" in s or "Exon" in s:
        return s.split(" ")[0].strip()
    return s


flipons_to_genes_df = (
    pd.read_table(f_flipon_to_gene_anno)
    .assign(
        coordinates=lambda x: x["V4"],
        geneStrand=lambda x: x["geneStrand"].apply(lambda y: "+" if y == 1 else "-"),
        gene_info=lambda x: x["gene_name"] + " (" + x["geneStrand"] + ")",
        annotation=lambda x: x["annotation"].apply(remove_gene_ids_from_annotation),
    )
    .loc[:, ["group", "coordinates", "annotation", "gene_info"]]
)

flipons_to_genes_df


Unnamed: 0,group,coordinates,annotation,gene_info
0,z-dna,chr7:142572121-142572208,Distal Intergenic,H19 (-)
1,z-dna,chr4:120414009-120414024,Intron,Scmh1 (+)
2,z-dna,chr11:85832528-85832543,Promoter (<=1kb),Tbx2 (+)
3,z-dna,chr11:85832843-85832860,Promoter (<=1kb),Tbx2 (+)
4,z-dna,chr11:85833014-85833035,Promoter (<=1kb),Tbx2 (+)
...,...,...,...,...
77695,h-dna,chr7:27899690-27899773,Distal Intergenic,AC139063.1 (-)
77696,h-dna,chr7:27899779-27899799,Distal Intergenic,AC139063.1 (-)
77697,h-dna,chr7:27899863-27899888,Distal Intergenic,AC139063.1 (-)
77698,h-dna,chr17:39848378-39848395,Promoter (<=1kb),CT010467.2 (-)


## Map flipons to: cCRE

In [7]:
pd.read_table(f_ccre, header=None)[9].value_counts()

dELS                        182982
pELS                         57202
dELS,CTCF-bound              28203
CTCF-only,CTCF-bound         24072
pELS,CTCF-bound              16620
PLS                          14062
PLS,CTCF-bound               10052
DNase-H3K4me3                 7095
DNase-H3K4me3,CTCF-bound      3443
Name: 9, dtype: int64

In [8]:
ccre_small_df = (
    pd.read_table(f_ccre, header=None)
    .assign(
        name_preformatted=lambda x: x[9] + "@" + x[3],
        name=lambda x: x["name_preformatted"].str.replace(
            ",CTCF-bound", " (CTCF-bound)"
        ),
    )
    .loc[:, [0, 1, 2, "name"]]
)

ccre_small_df.to_csv(f_ccre_small, sep="\t", header=False, index=False)
ccre_small_df


Unnamed: 0,0,1,2,name
0,chr1,3119617,3119911,dELS@EM10E0431203
1,chr1,3119914,3120120,dELS@EM10E0431204
2,chr1,3120346,3120662,dELS@EM10E0431205
3,chr1,3292622,3292971,dELS@EM10E0431207
4,chr1,3322453,3322797,dELS@EM10E0431208
...,...,...,...,...
343726,chrY,90729231,90729435,CTCF-only (CTCF-bound)@EM10E0932204
343727,chrY,90732178,90732526,CTCF-only (CTCF-bound)@EM10E0932207
343728,chrY,90734379,90734726,CTCF-only (CTCF-bound)@EM10E0932208
343729,chrY,90744476,90744639,DNase-H3K4me3 (CTCF-bound)@EM10E0932222


In [9]:
for flipon in flipon_data:
    flipon.path_int_ccre = flipon.path_bed_200.replace('.bed', '_and_ccre.bed')
    !bedtools intersect -a {flipon.path_bed_200} -b {f_ccre_small} -wo > {flipon.path_int_ccre}
    !wc -l {flipon.path_int_ccre}

1683 /home/fpavlov/projects/article_conserved_miRNA/data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_sidd.slop200_and_ccre.bed
20638 /home/fpavlov/projects/article_conserved_miRNA/data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_z-dna.slop200_and_ccre.bed
14637 /home/fpavlov/projects/article_conserved_miRNA/data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_quadruplex.slop200_and_ccre.bed
3756 /home/fpavlov/projects/article_conserved_miRNA/data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_h-dna.slop200_and_ccre.bed


In [10]:
flipons_to_ccre_df = pd.DataFrame()
columns = [
    "sidd_chr",
    "sidd_start",
    "sidd_end",
    "coordinates",
    "ccre_chr",
    "ccre_start",
    "ccre_end",
    "ccre",
    "int_len",
]

for flipon in flipon_data:
    df = (
        pd.read_table(flipon.path_int_ccre, header=None, names=columns)
        .assign(
            group=flipon.name,
            ccre_join=lambda x: x.groupby("coordinates")["ccre"].transform(
                lambda x: ", ".join(x)
            ),
        )
        .drop_duplicates(subset=["coordinates"])
        .loc[:, ["group", "coordinates", "ccre_join"]]
        .rename(columns={"ccre_join": "cCRE (+-200bp slop)"})
    )

    flipons_to_ccre_df = pd.concat([flipons_to_ccre_df, df], ignore_index=True)

flipons_to_ccre_df


Unnamed: 0,group,coordinates,cCRE (+-200bp slop)
0,sidd,chr1:11330535-11330712,"dELS@EM10E0431890, dELS@EM10E0431891"
1,sidd,chr1:16171838-16172011,dELS@EM10E0432450
2,sidd,chr1:20743638-20743760,DNase-H3K4me3@EM10E0432741
3,sidd,chr1:22532979-22533089,dELS (CTCF-bound)@EM10E0432966
4,sidd,chr1:23924439-23924546,"pELS@EM10E0433139, dELS@EM10E0433140"
...,...,...,...
24415,h-dna,chrX:152769666-152769687,pELS (CTCF-bound)@EM10E0930778
24416,h-dna,chrX:159987892-159987912,DNase-H3K4me3 (CTCF-bound)@EM10E0931180
24417,h-dna,chrX:161717977-161717993,"PLS@EM10E0931282, pELS@EM10E0931283"
24418,h-dna,chrX:162643118-162643148,"pELS@EM10E0931392, pELS@EM10E0931393, pELS (CT..."


## Map flipons to: repeats

In [11]:
columns = [
    "bin",
    "swScore",
    "milliDiv",
    "milliDel",
    "milliIns",
    "genoName",
    "genoStart",
    "genoEnd",
    "genoLeft",
    "strand",
    "repName",
    "repClass",
    "repFamily",
    "repStart",
    "repEnd",
    "repLeft",
    "id",
]
mm10_rmsk = (
    pd.read_table(f_rmsk, header=None, names=columns, compression="gzip")
    .query('repClass.str.contains("LTR") or repClass.str.contains("LINE")')
    .assign(
        name=lambda x: x["repClass"] + "@" + x["repName"] + " (" + x["strand"] + ")"
    )
    .loc[:, ["genoName", "genoStart", "genoEnd", "name", "swScore", "strand"]]
    .reset_index(drop=True)
    .to_csv(f_rmsk_line_ltr, sep="\t", header=None, index=None)
)

!head {f_rmsk_line_ltr}


chr1	3000000	3002128	LINE@L1_Mus3 (-)	12955	-
chr1	3003152	3003994	LINE@L1Md_F (-)	1216	-
chr1	3003993	3004054	LINE@L1_Mus3 (-)	234	-
chr1	3004040	3004206	LINE@L1_Rod (+)	3685	+
chr1	3004270	3005001	LINE@L1_Rod (+)	3685	+
chr1	3005001	3005439	LINE@L1_Rod (+)	1280	+
chr1	3005460	3005548	LINE@Lx9 (+)	4853	+
chr1	3005570	3006764	LINE@Lx9 (+)	4853	+
chr1	3007014	3007268	LINE@L1M4 (-)	438	-
chr1	3008116	3008483	LINE@L1_Mur2 (-)	1590	-


In [12]:
for flipon in flipon_data:
    flipon.path_int_repeats = flipon.path_bed_200.replace('.bed', '_and_repeats.bed')
    !bedtools intersect -a {flipon.path_bed_200} -b {f_rmsk_line_ltr} -wo > {flipon.path_int_repeats}
    !wc -l {flipon.path_int_repeats}

23473 /home/fpavlov/projects/article_conserved_miRNA/data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_sidd.slop200_and_repeats.bed
11832 /home/fpavlov/projects/article_conserved_miRNA/data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_z-dna.slop200_and_repeats.bed
13332 /home/fpavlov/projects/article_conserved_miRNA/data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_quadruplex.slop200_and_repeats.bed
15869 /home/fpavlov/projects/article_conserved_miRNA/data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_h-dna.slop200_and_repeats.bed


In [13]:
flipons_to_repeats_df = pd.DataFrame()
columns = [
    "sidd_chr",
    "sidd_start",
    "sidd_end",
    "coordinates",
    "rep_chr",
    "rep_start",
    "rep_end",
    "rep_name",
    "rep_score",
    "rep_strand",
    "int_len",
]

for flipon in flipon_data:
    df = (
        pd.read_table(flipon.path_int_repeats, header=None, names=columns)
        .assign(
            group=flipon.name,
            repeats_join=lambda x: x.groupby("coordinates")["rep_name"].transform(
                lambda x: ", ".join(x)
            ),
        )
        .drop_duplicates(subset=["coordinates"])
        .loc[:, ["group", "coordinates", "repeats_join"]]
        .rename(columns={"repeats_join": "LINE/LTR repeats (+-200bp slop)"})
    )

    flipons_to_repeats_df = pd.concat([flipons_to_repeats_df, df], ignore_index=True)

flipons_to_repeats_df


Unnamed: 0,group,coordinates,LINE/LTR repeats (+-200bp slop)
0,sidd,chr1:3235931-3236054,"LINE@L1Md_F2 (+), LINE@L1Md_F2 (+)"
1,sidd,chr1:3350891-3351054,"LINE@L1Md_T (+), LINE@Lx3B (-)"
2,sidd,chr1:3473131-3473291,LINE@L1_Mus1 (+)
3,sidd,chr1:3490742-3490885,"LINE@Lx3B (-), LINE@Lx2A1 (-)"
4,sidd,chr1:3566652-3566892,"LINE@Lx7 (+), LTR@RMER17A (+)"
...,...,...,...
39768,h-dna,chrY:4223501-4223522,LINE@L1_Mus2 (-)
39769,h-dna,chrY:4208594-4208610,"LTR@ORR1E (+), LINE@L1Md_T (-)"
39770,h-dna,chrY:4200098-4200122,LINE@L1MD (+)
39771,h-dna,chrY:4185636-4185659,"LTR@MMERGLN-int (-), LINE@L1Md_F2 (-)"


## Flipon mapping info: miRNA, gene features, cCREs, LINE/LTR repeats

In [14]:
flipons_to_data = (
    flipons_to_mirna_df.merge(
        flipons_to_genes_df, on=["group", "coordinates"], how="left"
    )
    .merge(flipons_to_ccre_df, on=["group", "coordinates"], how="left")
    .merge(flipons_to_repeats_df, on=["group", "coordinates"], how="left")
    .fillna("")
    .reset_index(drop=True)
)

# export only those regions which have at least 1 miRNA overlapping site
(
    flipons_to_data
    .replace("", np.nan)
    .dropna(subset=["miRNA (+ strand)", "miRNA (- strand)"], how="all")
    .fillna("")
    .to_csv(f_flipon_to_data, sep="\t", index=False)
)

flipons_to_data.to_csv(f_flipon_to_data_og, sep='\t', index=False)

flipons_to_data


Unnamed: 0,group,coordinates,miRNA (+ strand),miRNA (- strand),miRNA (intersection),annotation,gene_info,cCRE (+-200bp slop),LINE/LTR repeats (+-200bp slop)
0,sidd,chr10:100119606-100119743,"miR-539, miR-670","miR-143, miR-186, miR-30",,Distal Intergenic,Gm22918 (+),,"LTR@RLTR10C (+), LINE@Lx8b (-), LTR@MTB (+)"
1,sidd,chr10:100146768-100146986,"miR-155, miR-374",miR-448,,Promoter (2-3kb),Gm25287 (+),,LTR@MTB_Mm (-)
2,sidd,chr10:100160840-100160994,"miR-188, miR-204/211","miR-203a, miR-653",,Distal Intergenic,Gm25287 (+),dELS@EM10E0487086,"LINE@L1_Mus3 (-), LINE@L1_Mus3 (-)"
3,sidd,chr10:100192250-100192378,"miR-15/16/195/424/497, miR-203a (2)",,,Distal Intergenic,Gm47627 (-),,LINE@L1_Mus3 (+)
4,sidd,chr10:100401845-100402004,,"miR-101, miR-140",,Distal Intergenic,Gm4781 (-),,LINE@L1_Mus3 (-)
...,...,...,...,...,...,...,...,...,...
77695,h-dna,chrY:4200098-4200122,,,,Distal Intergenic,Gm28191 (+),,LINE@L1MD (+)
77696,h-dna,chrY:4208594-4208610,,,,Distal Intergenic,Gm28191 (+),,"LTR@ORR1E (+), LINE@L1Md_T (-)"
77697,h-dna,chrY:4223501-4223522,,,,Distal Intergenic,Gm28191 (+),,LINE@L1_Mus2 (-)
77698,h-dna,chrY:4231086-4231113,,,,Intron,Gm28191 (+),,LTR@MuRRS4-int (-)


In [15]:
flipons_to_data.group.value_counts().sort_index(ascending=False)

z-dna         25057
sidd          15294
quadruplex    20251
h-dna         17098
Name: group, dtype: int64

## Enrichment table

In [26]:
def get_group(x: list):
    ccres = ["dELS", "pELS", "PLS", "DNase-H3K4me3"]
    any_ccre = False
    for ccre in ccres:
        if ccre in x[0]:
            any_ccre = True
            break
    any_ctcf = "CTCF" in x[0]
    any_reps = x[1] != ""
    
    return ' & '.join([any_ccre * "cCRE", any_reps * "LINE/LTR", any_ctcf * "CTCF"]).replace('&  &','&').strip()


def get_rep_cats(x: str):
    return " + ".join(sorted(set([y.strip() for y in x.split() if y])))

with pd.option_context("display.max_columns", None, "display.max_rows", None):
    display(
        flipons_to_data.replace(regex=r'(@(.+?), )|(@(.+?)$)', value=' ').assign(
            ccre_col=lambda x: x["cCRE (+-200bp slop)"].apply(get_rep_cats),
            rep_col=lambda x: x["LINE/LTR repeats (+-200bp slop)"].apply(get_rep_cats),
            col=lambda x: (x["ccre_col"] + " + " + x["rep_col"]).str.strip(" +"),
            group=lambda x: x[['ccre_col', 'rep_col']].apply(get_group, axis=1).str.strip('& ')
        ).loc[:, ["annotation", "col", "group"]]
        .groupby('annotation')[['col', 'group']].value_counts().to_frame()
        .rename(columns={0: "count"}).reset_index()
        .pivot(index="annotation", columns=["group","col"], values="count")
        .sort_index(axis=1).T
        .fillna(0)
        .reset_index().groupby('group')[["3' UTR", "5' UTR", "Exon", "Intron", "Distal Intergenic", "Downstream (<=300bp)", "Promoter (<=1kb)", "Promoter (1-2kb)", "Promoter (2-3kb)"]].sum()
        .T
    )
    display(
        flipons_to_data.replace(regex=r'(@(.+?), )|(@(.+?)$)', value=' ').assign(
            ccre_col=lambda x: x["cCRE (+-200bp slop)"].apply(get_rep_cats),
            rep_col=lambda x: x["LINE/LTR repeats (+-200bp slop)"].apply(get_rep_cats),
            col=lambda x: (x["ccre_col"] + " + " + x["rep_col"]).str.strip(" +"),
            group=lambda x: x[['ccre_col', 'rep_col']].apply(get_group, axis=1).str.strip('& ')
        ).loc[:, ["annotation", "col", "group"]]
        .groupby('annotation')[['col', 'group']].value_counts().to_frame()
        .rename(columns={0: "count"}).reset_index()
        .pivot(index="annotation", columns=["group","col"], values="count")
        .sort_index(axis=1).T
        .fillna(0)
    )


group,Unnamed: 1_level_0,CTCF,LINE/LTR,LINE/LTR & CTCF,cCRE,cCRE & CTCF,cCRE & LINE/LTR,cCRE & LINE/LTR & CTCF
annotation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
3' UTR,315.0,3.0,111.0,0.0,138.0,102.0,1.0,4.0
5' UTR,10.0,0.0,1.0,0.0,4.0,4.0,0.0,0.0
Exon,586.0,3.0,114.0,0.0,297.0,232.0,3.0,2.0
Intron,4914.0,67.0,10291.0,37.0,964.0,354.0,307.0,57.0
Distal Intergenic,5192.0,56.0,24048.0,93.0,873.0,502.0,414.0,49.0
Downstream (<=300bp),7.0,0.0,25.0,0.0,8.0,6.0,1.0,0.0
Promoter (<=1kb),1992.0,12.0,1296.0,3.0,8590.0,8547.0,159.0,103.0
Promoter (1-2kb),1094.0,6.0,1255.0,6.0,900.0,692.0,52.0,11.0
Promoter (2-3kb),763.0,13.0,1266.0,2.0,430.0,251.0,55.0,7.0


Unnamed: 0_level_0,annotation,3' UTR,5' UTR,Distal Intergenic,Downstream (<=300bp),Exon,Intron,Promoter (1-2kb),Promoter (2-3kb),Promoter (<=1kb)
group,col,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
,,315.0,10.0,5192.0,7.0,586.0,4914.0,1094.0,763.0,1992.0
CTCF,(CTCF-bound) + CTCF-only,3.0,0.0,56.0,0.0,3.0,67.0,6.0,13.0,12.0
LINE/LTR,LINE,23.0,1.0,10936.0,10.0,32.0,5020.0,521.0,457.0,525.0
LINE/LTR,LINE + LTR,20.0,0.0,4709.0,8.0,18.0,1590.0,210.0,202.0,147.0
LINE/LTR,LINE + LTR?,0.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0
LINE/LTR,LTR,68.0,0.0,8396.0,7.0,64.0,3680.0,524.0,607.0,624.0
LINE/LTR,LTR + LTR?,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
LINE/LTR,LTR?,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
LINE/LTR & CTCF,(CTCF-bound) + CTCF-only + LINE,0.0,0.0,39.0,0.0,0.0,13.0,2.0,1.0,2.0
LINE/LTR & CTCF,(CTCF-bound) + CTCF-only + LINE + LTR,0.0,0.0,7.0,0.0,0.0,5.0,0.0,1.0,0.0


In [27]:
(
    flipons_to_data.replace(regex=r"(@(.+?), )|(@(.+?)$)", value=" ")
    .assign(
        ccre_col=lambda x: x["cCRE (+-200bp slop)"].apply(get_rep_cats),
        rep_col=lambda x: x["LINE/LTR repeats (+-200bp slop)"].apply(get_rep_cats),
        col=lambda x: (x["ccre_col"] + " + " + x["rep_col"]).str.strip(" +"),
        feature_group=lambda x: x[["ccre_col", "rep_col"]]
        .apply(get_group, axis=1)
        .str.strip("&"),
    )
    .groupby(["feature_group", "annotation"])["gene_info"]
    .apply(lambda x: ", ".join(np.unique([y.split()[0] for y in x if y != ""])))
    .to_frame()
).to_csv(f_enrichment_table_genes, sep="\t")


In [28]:
data_for_pies = (
    flipons_to_data.replace(regex=r"(@(.+?), )|(@(.+?)$)", value=" ")
    .assign(
        ccre_col=lambda x: x["cCRE (+-200bp slop)"].apply(get_rep_cats),
        rep_col=lambda x: x["LINE/LTR repeats (+-200bp slop)"].apply(get_rep_cats),
        col=lambda x: (x["ccre_col"] + " + " + x["rep_col"]).str.strip(" +"),
        feature_group=lambda x: x[["ccre_col", "rep_col"]]
        .apply(get_group, axis=1)
        .str.strip("&"),
    )
    .loc[:, ["feature_group", "annotation", "group"]]
    # .query('annotation.str.contains("Promoter")')
    .groupby(["annotation", "feature_group"])[["group"]]
    .value_counts()
    .to_frame()
    .rename(columns={0: "count"})
    .reset_index()
)
data_for_pies[['annotation', 'feature_group']] = data_for_pies[['annotation', 'feature_group']].replace('', 'no annotation')
data_for_pies['percentage'] = data_for_pies['count'] / data_for_pies.groupby(['annotation', 'feature_group'])['count'].transform('sum')
data_for_pies['log_count'] = np.log2(data_for_pies['count'])
data_for_pies = data_for_pies.replace({'quadruplex':'G4', 'z-dna':'Z-DNA','h-dna':'H-DNA','sidd':'SIDD'})
data_for_pies

Unnamed: 0,annotation,feature_group,group,count,percentage,log_count
0,3' UTR,no annotation,G4,113,0.358730,6.820179
1,3' UTR,no annotation,Z-DNA,98,0.311111,6.614710
2,3' UTR,no annotation,SIDD,55,0.174603,5.781360
3,3' UTR,no annotation,H-DNA,49,0.155556,5.614710
4,3' UTR,CTCF,SIDD,2,0.666667,1.000000
...,...,...,...,...,...,...
214,Promoter (<=1kb),cCRE & LINE/LTR,SIDD,31,0.194969,4.954196
215,Promoter (<=1kb),cCRE & LINE/LTR & CTCF,Z-DNA,47,0.456311,5.554589
216,Promoter (<=1kb),cCRE & LINE/LTR & CTCF,G4,25,0.242718,4.643856
217,Promoter (<=1kb),cCRE & LINE/LTR & CTCF,H-DNA,20,0.194175,4.321928


In [31]:
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots

# data_for_pies = (
#     flipons_to_data.replace(regex=r"(@(.+?), )|(@(.+?)$)", value=" ")
#     .assign(
#         ccre_col=lambda x: x["cCRE (+-200bp slop)"].apply(get_rep_cats),
#         rep_col=lambda x: x["LINE/LTR repeats (+-200bp slop)"].apply(get_rep_cats),
#         col=lambda x: (x["ccre_col"] + " + " + x["rep_col"]).str.strip(" +"),
#         feature_group=lambda x: x[["ccre_col", "rep_col"]]
#         .apply(get_group, axis=1)
#         .str.strip("&"),
#     )
#     .loc[:, ["feature_group", "annotation", "group"]]
#     .query('annotation.str.contains("Promoter")')
#     .groupby(["annotation", "feature_group"])[["group"]]
#     .value_counts()
#     .to_frame()
#     .rename(columns={0: "count"})
#     .reset_index()
# )

# data_for_pies['feature_group'] = data_for_pies['feature_group'].replace('', 'no annotation')
# ns = data_for_pies.groupby(['annotation', 'feature_group'])['count'].sum().to_frame()

# # Create subplots: use 'domain' type for Pie subplot

# annotations = data_for_pies["annotation"].unique()
# feature_groups = list(filter(lambda x: x in ['no annotation', 'cCRE', 'Repeats', 'cCRE&&CTCF'],data_for_pies["feature_group"].unique()))

# nrows = len(annotations)
# ncols = len(feature_groups)

# subplot_titles = []
# for anno in annotations:
#     for feat in feature_groups:
#         subplot_titles.append(ns.loc[(anno, feat),'count'])

# fig = make_subplots(
#     rows=nrows,
#     cols=ncols,
#     specs=np.full((nrows, ncols), {"type": "domain"}).tolist(),
#     shared_xaxes="rows",
#     shared_yaxes="columns",
#     column_titles=tuple(feature_groups),
#     row_titles=tuple(annotations),
#     subplot_titles = [f"n={x:,d}" for x in subplot_titles]
# )

# x_anno_coords = np.arange(0.105, 1, 0.257)
# y_anno_coords = [0.875, 0.485, 0.095]
# # https://stackoverflow.com/questions/65563922/how-to-change-subplot-title-after-creation-in-plotly
# for i in range(nrows*ncols):
#     fig.layout.annotations[i].update(font_size=12, x=x_anno_coords[i%ncols], y=y_anno_coords[i//ncols])

# for i, annotation in enumerate(annotations):
#     for j, feature_group in enumerate(feature_groups):
#         sub_df = data_for_pies.query(
#             "annotation==@annotation and feature_group==@feature_group"
#         )
#         fig.add_trace(
#             go.Pie(labels=sub_df["group"], values=sub_df["count"], name=annotation),
#             i + 1,
#             j + 1,
#         )

# # Use `hole` to create a donut-like pie chart
# fig.update_traces(hole=0.4, hoverinfo="label+percent+name")
# fig.update_layout(
#     title_text="Flipon distribution among different feature categories",
#     height=800,
#     width=1200,
# )
# fig.show()


## Task 1

The counts for the other flipons that are associated with the CCRE positive promoters (with no repeats) and those with the repeat promoters (and noCCRE). Is there any difference?

In [None]:
def get_repeat_category(s: pd.Series):
    # C1: Repeats and no cCRE
    if s[0] == "" and s[1] != "":
        return "C1"
    # C2: cCRE and no Repeats
    if s[0] != "" and s[1] == "":
        return "C2"
    # C3: Repeats and cCRE
    if s[0] != "" and s[1] != "":
        return "C3"
    # C4: no Repeats and no cCRE
    return "C4"


with pd.option_context("display.max_columns", None):
    display(
        flipons_to_data.assign(
            category=lambda x: x[
                ["cCRE (+-200bp slop)", "LINE/LTR repeats (+-200bp slop)"]
            ].apply(get_repeat_category, axis=1),
        )
        .groupby(["annotation", "category"])["group"]
        .value_counts()
        .to_frame()
        .sort_index(ascending=[True, True, False]).rename(columns={'group':'total'})
        .reset_index()
        .pivot(index=['group'],columns=['annotation', 'category'],values=['total'])
        .sort_index(ascending=False)
        .fillna(0)
    )


Unnamed: 0_level_0,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total
annotation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,3' UTR,3' UTR,3' UTR,3' UTR,5' UTR,5' UTR,5' UTR,Distal Intergenic,Distal Intergenic,Distal Intergenic,Distal Intergenic,Downstream (<=300bp),Downstream (<=300bp),Downstream (<=300bp),Exon,Exon,Exon,Exon,Intron,Intron,Intron,Intron,Promoter (1-2kb),Promoter (1-2kb),Promoter (1-2kb),Promoter (1-2kb),Promoter (2-3kb),Promoter (2-3kb),Promoter (2-3kb),Promoter (2-3kb),Promoter (<=1kb),Promoter (<=1kb),Promoter (<=1kb),Promoter (<=1kb)
category,C1,C2,C3,C4,C1,C2,C3,C4,C1,C2,C4,C1,C2,C3,C4,C1,C2,C4,C1,C2,C3,C4,C1,C2,C3,C4,C1,C2,C3,C4,C1,C2,C3,C4,C1,C2,C3,C4
group,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3,Unnamed: 24_level_3,Unnamed: 25_level_3,Unnamed: 26_level_3,Unnamed: 27_level_3,Unnamed: 28_level_3,Unnamed: 29_level_3,Unnamed: 30_level_3,Unnamed: 31_level_3,Unnamed: 32_level_3,Unnamed: 33_level_3,Unnamed: 34_level_3,Unnamed: 35_level_3,Unnamed: 36_level_3,Unnamed: 37_level_3,Unnamed: 38_level_3
z-dna,3079.0,8111.0,171.0,3697.0,14.0,28.0,0.0,52.0,1.0,3.0,0.0,2673.0,226.0,79.0,629.0,4.0,2.0,1.0,20.0,132.0,2.0,167.0,883.0,155.0,36.0,581.0,143.0,237.0,9.0,181.0,174.0,94.0,4.0,122.0,175.0,2772.0,47.0,353.0
sidd,4580.0,215.0,71.0,343.0,33.0,25.0,2.0,40.0,0.0,0.0,1.0,5255.0,72.0,87.0,169.0,8.0,1.0,0.0,32.0,7.0,1.0,17.0,2288.0,74.0,81.0,200.0,361.0,72.0,12.0,63.0,295.0,35.0,17.0,41.0,353.0,316.0,36.0,91.0
quadruplex,2825.0,5064.0,118.0,1786.0,15.0,41.0,1.0,56.0,0.0,2.0,4.0,3388.0,232.0,82.0,295.0,3.0,2.0,2.0,15.0,41.0,2.0,95.0,1526.0,182.0,46.0,334.0,224.0,188.0,14.0,114.0,197.0,94.0,7.0,88.0,193.0,2576.0,35.0,364.0
h-dna,4095.0,842.0,151.0,3313.0,11.0,15.0,0.0,30.0,0.0,0.0,1.0,3428.0,138.0,108.0,646.0,5.0,0.0,0.0,13.0,8.0,0.0,25.0,1472.0,140.0,79.0,560.0,198.0,93.0,14.0,123.0,212.0,46.0,13.0,83.0,216.0,773.0,41.0,206.0


In [None]:
def get_mirna_category(x: list):
    if len(x) == 0 or x[0] == "":
        return "T1"
    if len(x) == 1:
        if "(" in x[0]:
            return "T3"
        return "T2"
    return "T4"


with pd.option_context("display.max_columns", None):
    display(
        flipons_to_data.query('annotation.str.contains("Promoter")')
        .assign(
            mirna_p_list=lambda x: x["miRNA (+ strand)"].str.split(", "),
            mirna_m_list=lambda x: x["miRNA (- strand)"].str.split(", "),
            ccre_repeat_category=lambda x: x[
                ["cCRE (+-200bp slop)", "LINE/LTR repeats (+-200bp slop)"]
            ].apply(get_repeat_category, axis=1),
            F=lambda x: x["mirna_p_list"].apply(get_mirna_category),
            RC=lambda x: x["mirna_m_list"].apply(get_mirna_category),
        )
        .melt(
            id_vars=["annotation", "group"],
            value_vars=["F", "RC"],
        )
        .groupby(["annotation", "group", "value"])["variable"]
        .value_counts()
        .to_frame()
        .rename(columns={'variable':'total'})
        .sort_index(ascending=[True, False, True, True])
        .reset_index()
        .pivot(index=['group'],columns=['annotation', 'value', 'variable'],values=['total'])
        .sort_index(ascending=False)
        .fillna(0)
    )


Unnamed: 0_level_0,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total,total
annotation,Promoter (1-2kb),Promoter (1-2kb),Promoter (1-2kb),Promoter (1-2kb),Promoter (1-2kb),Promoter (1-2kb),Promoter (1-2kb),Promoter (1-2kb),Promoter (2-3kb),Promoter (2-3kb),Promoter (2-3kb),Promoter (2-3kb),Promoter (2-3kb),Promoter (2-3kb),Promoter (2-3kb),Promoter (2-3kb),Promoter (<=1kb),Promoter (<=1kb),Promoter (<=1kb),Promoter (<=1kb),Promoter (<=1kb),Promoter (<=1kb),Promoter (<=1kb),Promoter (<=1kb)
value,T1,T1,T2,T2,T3,T3,T4,T4,T1,T1,T2,T2,T3,T3,T4,T4,T1,T1,T2,T2,T3,T3,T4,T4
variable,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC
group,Unnamed: 1_level_4,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4,Unnamed: 23_level_4,Unnamed: 24_level_4
z-dna,497.0,501.0,59.0,58.0,6.0,3.0,8.0,8.0,329.0,333.0,48.0,44.0,7.0,9.0,10.0,8.0,3030.0,3033.0,279.0,285.0,14.0,11.0,24.0,18.0
sidd,91.0,92.0,160.0,155.0,16.0,23.0,241.0,238.0,83.0,77.0,95.0,102.0,19.0,22.0,191.0,187.0,200.0,212.0,220.0,232.0,29.0,18.0,347.0,334.0
quadruplex,429.0,408.0,88.0,107.0,8.0,2.0,15.0,23.0,291.0,312.0,78.0,57.0,5.0,4.0,12.0,13.0,2368.0,2457.0,656.0,578.0,32.0,31.0,112.0,102.0
h-dna,401.0,384.0,19.0,39.0,8.0,5.0,0.0,0.0,324.0,316.0,24.0,33.0,6.0,4.0,0.0,1.0,1140.0,1140.0,80.0,84.0,11.0,10.0,5.0,2.0


In [None]:
with pd.option_context("display.max_columns", None):
    display(
        flipons_to_data
        .query('annotation.str.contains("Promoter")')
        .assign(
            mirna_p_list=lambda x: x["miRNA (+ strand)"].str.split(", "),
            mirna_m_list=lambda x: x["miRNA (- strand)"].str.split(", "),
            ccre_repeat_category=lambda x: x[
                ["cCRE (+-200bp slop)", "LINE/LTR repeats (+-200bp slop)"]
            ].apply(get_repeat_category, axis=1),
            F=lambda x: x["mirna_p_list"].apply(get_mirna_category),
            RC=lambda x: x["mirna_m_list"].apply(get_mirna_category),
        )
        .melt(id_vars=['ccre_repeat_category', 'group'], value_vars=['F', 'RC'])
        .groupby(['group', 'ccre_repeat_category', 'value'])[['variable']].value_counts().to_frame().sort_index(ascending=[False, True, True, True]).T
    )


group,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,z-dna,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,sidd,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,quadruplex,h-dna,h-dna,h-dna,h-dna,h-dna,h-dna,h-dna,h-dna,h-dna,h-dna,h-dna,h-dna,h-dna,h-dna,h-dna,h-dna,h-dna,h-dna,h-dna,h-dna,h-dna,h-dna,h-dna,h-dna,h-dna,h-dna,h-dna
ccre_repeat_category,C1,C1,C1,C1,C1,C1,C1,C1,C2,C2,C2,C2,C2,C2,C2,C2,C3,C3,C3,C3,C3,C3,C3,C4,C4,C4,C4,C4,C4,C4,C4,C1,C1,C1,C1,C1,C1,C1,C1,C2,C2,C2,C2,C2,C2,C2,C2,C3,C3,C3,C3,C3,C3,C3,C3,C4,C4,C4,C4,C4,C4,C4,C1,C1,C1,C1,C1,C1,C1,C1,C2,C2,C2,C2,C2,C2,C2,C2,C3,C3,C3,C3,C3,C3,C3,C3,C4,C4,C4,C4,C4,C4,C4,C4,C1,C1,C1,C1,C1,C1,C2,C2,C2,C2,C2,C2,C2,C2,C3,C3,C3,C3,C3,C4,C4,C4,C4,C4,C4,C4,C4
value,T1,T1,T2,T2,T3,T3,T4,T4,T1,T1,T2,T2,T3,T3,T4,T4,T1,T1,T2,T2,T3,T4,T4,T1,T1,T2,T2,T3,T3,T4,T4,T1,T1,T2,T2,T3,T3,T4,T4,T1,T1,T2,T2,T3,T3,T4,T4,T1,T1,T2,T2,T3,T3,T4,T4,T1,T1,T2,T2,T3,T4,T4,T1,T1,T2,T2,T3,T3,T4,T4,T1,T1,T2,T2,T3,T3,T4,T4,T1,T1,T2,T2,T3,T3,T4,T4,T1,T1,T2,T2,T3,T3,T4,T4,T1,T1,T2,T2,T3,T3,T1,T1,T2,T2,T3,T3,T4,T4,T1,T1,T2,T2,T3,T1,T1,T2,T2,T3,T3,T4,T4
variable,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,F,RC,RC,F,RC,F,RC,F,RC,F,RC
0,401,416,68,62,10,5,13,9,2839,2842,237,240,8,5,19,16,50,52,9,6,1,1,1,566,557,72,79,9,12,9,8,189,204,266,277,48,55,506,473,126,125,134,131,10,5,153,162,14,10,19,24,3,3,29,28,45,42,56,57,3,91,96,485,491,110,97,1,6,18,20,2130,2223,596,516,30,24,102,95,40,38,11,14,2,1,3,3,433,425,105,115,12,6,16,20,582,573,34,47,10,6,841,833,60,72,8,5,3,2,65,61,3,5,2,377,373,26,32,7,6,2,1


## Task 2

The  overlap with DNA methylation at CCRE positive elements (but no repeats)  at all genomic locations and the same for repeats (with no CCRE overlap). Are repeat promoters preferentially methylated?


### Map flipons to: H3K9ac and H3K14ac

In [None]:
# with pd.option_context("display.max_columns", None):
#     display(
#         flipons_to_data.assign(
#             category=lambda x: x[
#                 ["cCRE (+-200bp slop)", "LINE/LTR repeats (+-200bp slop)"]
#             ].apply(get_repeat_category, axis=1),
#             chr=lambda x: x['coordinates'].str.split(':', expand=True)[0],
#             start=lambda x: x['coordinates'].str.split(':', expand=True)[1].str.split('-', expand=True)[0],
#             end=lambda x: x['coordinates'].str.split(':', expand=True)[1].str.split('-', expand=True)[1],
#         )
#     )

In [None]:
# for flipon in tqdm(flipon_data):
#     intersection_path_k9 = f"{d_flipons_and_histones}/{flipon.name}_and_h3k9ac.bed"
#     intersection_path_k14 = f"{d_flipons_and_histones}/{flipon.name}_and_h3k14ac.bed"
#     !bedtools intersect -a {flipon.path_bed} -b {f_h3k9ac} -u > {intersection_path_k9}
#     !bedtools intersect -a {flipon.path_bed} -b {f_h3k14ac} -u > {intersection_path_k14}

In [None]:
# flipons_to_histones = pd.DataFrame()

# for entry in os.scandir(d_flipons_and_histones):
#     flipon_name = entry.name.split("_")[0]
#     histone_name = entry.name.split("_")[2].split(".")[0]

#     df = (
#         pd.read_table(entry.path, header=None)[[3]]
#         .rename(columns={3: "coordinates"})
#         .assign(group=flipon_name, histone=histone_name)
#     )
#     flipons_to_histones = pd.concat([flipons_to_histones, df], ignore_index=True)

# flipons_to_histones = flipons_to_data.merge(
#     (
#         flipons_to_histones.assign(
#             histones_join=lambda x: x.groupby(["group", "coordinates"])[
#                 "histone"
#             ].transform(lambda x: ", ".join(set(x)))
#         )
#         .drop_duplicates(subset=["coordinates"])
#         .loc[:, ["group", "coordinates", "histones_join"]]
#         .reset_index(drop=True)
#     ),
#     on=["group", "coordinates"],
#     how="left",
# ).assign(
#     category=lambda x: x[
#         ["cCRE (+-200bp slop)", "LINE/LTR repeats (+-200bp slop)"]
#     ].apply(get_repeat_category, axis=1)
# )

# with pd.option_context("display.max_columns", None):
#     display(
#         flipons_to_histones
#         .fillna('')
#         .groupby(['group', 'annotation', 'category'])['histones_join'].value_counts()
#         .to_frame()
#         .rename(columns={'histones_join':'total'})
#         .sort_index(ascending=[False, True, True, True])
#         .reset_index()
#         .pivot(index=['annotation', 'group'],columns=['category', 'histones_join'],values=['total'])
#         .sort_index(ascending=[False, False])
#         .fillna(0)
#     )


## Task 3

Are the miRNAs that bind repeat promoters (no overlap with CCRE) the same or different from the CCRE promoters (no overlap with repeats)?

1. Pick all **promoter regions** with **repeats** but **no cCREs** and make a subset of miRNAs
2. Pick all **promoter regions** with **cCREs** but **no repeats** and make a subset of miRNAs
3. Make a barplot

In [None]:
form_df_1 = (
    flipons_to_data.query(
        'annotation.str.contains("Promoter") and \
        `cCRE (+-200bp slop)` == "" and \
        `LINE/LTR repeats (+-200bp slop)` != ""'
    )
    .replace(regex=" \(\d+\)", value="")
    .assign(type="Repeats without cCREs")
)

form_df_2 = (
    flipons_to_data.query(
        'annotation.str.contains("Promoter") and \
        `cCRE (+-200bp slop)` != "" and \
        `LINE/LTR repeats (+-200bp slop)` == ""'
    )
    .replace(regex=" \(\d+\)", value="")
    .assign(type="cCREs without Repeats")
)

form_df = (
    pd.concat([form_df_1, form_df_2], ignore_index=True)
    .assign(
        mirna_p=lambda x: x["miRNA (+ strand)"].apply(lambda x: x.split(", ")),
        mirna_m=lambda x: x["miRNA (- strand)"].apply(lambda x: x.split(", ")),
    )
    .explode("mirna_p")
    .explode("mirna_m")
    .loc[:, ["type", "annotation", "group", "mirna_p", "mirna_m"]]
    # .assign(mirna_strand=lambda x: x['mirna_p']-x['mirna_m'])
    .melt(
        id_vars=["type", "annotation", "group"],
        value_vars=["mirna_p", "mirna_m"],
        var_name="strand",
        value_name="mirna",
    )
    .groupby(["mirna", "type", "group"])["strand"]
    .value_counts()
    .to_frame()
    .rename(columns={"strand": "total_count"})
    .reset_index()
    .replace("", np.nan)
    .dropna()
    # .pivot(index=['mirna', 'type', 'group'], columns=['strand'], values=['total_count'])
    # .reset_index()
    # .fillna(0)
    # .assign(mirna_difference=lambda x: x[('total_count', 'mirna_p')] - x[('total_count', 'mirna_m')])
    # .droplevel(1, axis=1)
    .replace('quadruplex', 'g4')
    .sort_values(["mirna", 'type', 'group'], ascending=[False, False, False])
)

form_df['total_count'] = form_df.apply(lambda x: -x['total_count'] if x['strand']=='mirna_m' else x['total_count'], axis=1)

# form_df.columns = ['mirna', 'type', 'group', '- strand', '+ strand', 'strand difference']

form_df['total_group_counts'] = form_df.assign(total_count=lambda x: x.total_count.apply(abs)).groupby('group')['total_count'].transform('sum')
form_df['total_count_rel'] = form_df['total_count'] / form_df['total_group_counts'] * 100

form_df


Unnamed: 0,mirna,type,group,strand,total_count,total_group_counts,total_count_rel
1062,miR-99/100,cCREs without Repeats,z-dna,mirna_p,1,771,0.129702
1061,miR-99/100,cCREs without Repeats,sidd,mirna_m,-1,8859,-0.011288
1060,miR-99/100,cCREs without Repeats,g4,mirna_m,-1,2007,-0.049826
1059,miR-96/1271,cCREs without Repeats,z-dna,mirna_p,1,771,0.129702
1057,miR-96/1271,cCREs without Repeats,sidd,mirna_m,-5,8859,-0.056440
...,...,...,...,...,...,...,...
19,let-7/miR-98,cCREs without Repeats,h-dna,mirna_m,-1,252,-0.396825
20,let-7/miR-98,cCREs without Repeats,g4,mirna_p,1,2007,0.049826
17,let-7/miR-98,Repeats without cCREs,sidd,mirna_p,5,8859,0.056440
18,let-7/miR-98,Repeats without cCREs,sidd,mirna_m,-2,8859,-0.022576


In [None]:
fig = px.bar(
    form_df,
    y="mirna",
    x="total_count_rel",
    color="strand",
    facet_col="group",
    orientation='h',
    facet_row="type",
    hover_data=['total_count', 'total_group_counts'],
    # barmode="group",
    # category_orders={"day": ["Thur", "Fri", "Sat", "Sun"], "time": ["Lunch", "Dinner"]},
    color_discrete_map={'mirna_p':'tomato', 'mirna_m':'royalblue'},
)

fig.for_each_annotation(lambda a: a.update(text=f'<b>{a.text.split("=")[1]}</b>'))

fig.update_yaxes(categoryorder='array', categoryarray=form_df['mirna'].unique())
fig.update_layout(
    title="miRNAs that bind <b>repeat promoters (no overlap with CCRE)</b> vs <b>the CCRE promoters (no overlap with repeats)</b>",
    height=920,
    width=1700,
)
fig.write_html(f_plotly_ccre_vs_repeats_3)
fig.show()


## Task 4

In [None]:
form_df_1 = (
    flipons_to_data.query(
        'annotation.str.contains("Promoter") and \
        `cCRE (+-200bp slop)` == "" and \
        `LINE/LTR repeats (+-200bp slop)` != ""'
    )
    .replace(regex=" \(\d+\)", value="")
    .assign(type="Repeats without cCREs")
)

form_df_2 = (
    flipons_to_data.query(
        'annotation.str.contains("Promoter") and \
        `cCRE (+-200bp slop)` != "" and \
        `LINE/LTR repeats (+-200bp slop)` == ""'
    )
    .replace(regex=" \(\d+\)", value="")
    .assign(type="cCREs without Repeats")
)

form_df = (
    pd.concat([form_df_1, form_df_2], ignore_index=True)
    .assign(
        p=lambda x: x["miRNA (+ strand)"].apply(lambda x: x.split(", ")),
        m=lambda x: x["miRNA (- strand)"].apply(lambda x: x.split(", ")),
        gene_strand=lambda x: x['gene_info'].str[-3:].str[1].apply(lambda y: 'p' if y=="+" else 'm')
    )
    .explode("p")
    .explode("m")
    .loc[:, ["type", "annotation", 'gene_strand', "group", "p", "m"]]
    .melt(
        id_vars=["type", "annotation", "gene_strand", "group"],
        value_vars=["p", "m"],
        var_name="mirna_strand",
        value_name="mirna",
    )
    .assign(
        mirna_gene_strand_orientation = lambda x: x[['gene_strand', 'mirna_strand']].apply(lambda y: 'same' if y[0]==y[1] else 'different', axis=1)
    )
    .groupby(["mirna", "type", "group"])[["mirna_gene_strand_orientation"]]
    .value_counts()
    .to_frame()
    .rename(columns={0: "total_count"})
    .reset_index()
    .replace("", np.nan)
    .dropna()
    .replace('quadruplex', 'g4')
    .sort_values(["mirna", 'type', 'group'], ascending=[False, False, False])
    .reset_index(drop=True)
)

form_df['total_count_signed'] = form_df.apply(lambda x: x.total_count if x.mirna_gene_strand_orientation=="same" else -x.total_count, axis=1)

form_df['total_group_count'] = form_df.groupby('group')['total_count'].transform('sum')
form_df['total_group_count_rel'] = form_df['total_count'] / form_df['total_group_count'] * 100
form_df['total_group_signed_rel'] = form_df.apply(lambda x: x.total_group_count_rel if x.mirna_gene_strand_orientation=="same" else -x.total_group_count_rel, axis=1)

form_df['total_mirna_count'] = form_df.groupby('mirna')['total_count'].transform('sum')
form_df['total_mirna_count_rel'] = form_df['total_count'] / form_df['total_mirna_count'] * 100
form_df['total_mirna_signed_rel'] = form_df.apply(lambda x: x.total_mirna_count_rel if x.mirna_gene_strand_orientation=="same" else -x.total_mirna_count_rel, axis=1)

form_df['total_mirna_group_count'] = form_df.groupby(['mirna', 'group'])['total_count'].transform('sum')
form_df['total_mirna_group_count_rel'] = form_df['total_count'] / form_df['total_mirna_group_count'] * 100
form_df['total_mirna_group_signed_rel'] = form_df.apply(lambda x: x.total_mirna_group_count_rel if x.mirna_gene_strand_orientation=="same" else -x.total_mirna_group_count_rel, axis=1)

form_df['custom_type'] = form_df['type'] + ", " + form_df['mirna_gene_strand_orientation']

form_df


Unnamed: 0,mirna,type,group,mirna_gene_strand_orientation,total_count,total_count_signed,total_group_count,total_group_count_rel,total_group_signed_rel,total_mirna_count,total_mirna_count_rel,total_mirna_signed_rel,total_mirna_group_count,total_mirna_group_count_rel,total_mirna_group_signed_rel,custom_type
0,miR-99/100,cCREs without Repeats,z-dna,same,1,1,771,0.129702,0.129702,3,33.333333,33.333333,1,100.000000,100.000000,"cCREs without Repeats, same"
1,miR-99/100,cCREs without Repeats,sidd,different,1,-1,8859,0.011288,-0.011288,3,33.333333,-33.333333,1,100.000000,-100.000000,"cCREs without Repeats, different"
2,miR-99/100,cCREs without Repeats,g4,same,1,1,2007,0.049826,0.049826,3,33.333333,33.333333,1,100.000000,100.000000,"cCREs without Repeats, same"
3,miR-96/1271,cCREs without Repeats,z-dna,different,1,-1,771,0.129702,-0.129702,25,4.000000,-4.000000,1,100.000000,-100.000000,"cCREs without Repeats, different"
4,miR-96/1271,cCREs without Repeats,sidd,same,5,5,8859,0.056440,0.056440,25,20.000000,20.000000,24,20.833333,20.833333,"cCREs without Repeats, same"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1053,let-7/miR-98,cCREs without Repeats,h-dna,same,1,1,252,0.396825,0.396825,16,6.250000,6.250000,2,50.000000,50.000000,"cCREs without Repeats, same"
1054,let-7/miR-98,cCREs without Repeats,g4,different,1,-1,2007,0.049826,-0.049826,16,6.250000,-6.250000,1,100.000000,-100.000000,"cCREs without Repeats, different"
1055,let-7/miR-98,Repeats without cCREs,sidd,same,6,6,8859,0.067728,0.067728,16,37.500000,37.500000,13,46.153846,46.153846,"Repeats without cCREs, same"
1056,let-7/miR-98,Repeats without cCREs,sidd,different,1,-1,8859,0.011288,-0.011288,16,6.250000,-6.250000,13,7.692308,-7.692308,"Repeats without cCREs, different"


In [None]:
fig = px.bar(
    form_df,
    y="mirna",
    x="total_group_count_rel",
    color="mirna_gene_strand_orientation",
    facet_col="group",
    orientation='h',
    facet_row="type",
    hover_data=['total_count', 'total_group_count'],
    # barmode="group",
    # category_orders={"day": ["Thur", "Fri", "Sat", "Sun"], "time": ["Lunch", "Dinner"]},
    color_discrete_map={'same':'tomato', 'different':'royalblue'},
)

fig.for_each_annotation(lambda a: a.update(text=f'<b>{a.text.split("=")[1]}</b>'))

fig.update_yaxes(categoryorder='array', categoryarray=form_df['mirna'].unique())
fig.update_layout(
    title="miRNAs that bind <b>repeat promoters (no overlap with CCRE)</b> vs <b>the CCRE promoters (no overlap with repeats)</b>",
    height=920,
    width=1700,
)
fig.write_html(f_plotly_ccre_vs_repeats_4)
fig.show()


In [None]:
# quartiles -> filter out unnecessaries
# also plot a hiistogram
# format for 2 pages

# Aire: Z-DNA bert (!)
# form_df.total_counts

In [None]:
# https://plotly.com/python/legend/#grouped-legend-items
fig = px.bar(
    form_df,
    x='total_mirna_group_signed_rel',
    y='mirna',
    facet_col='group',
    text='total_mirna_group_count',
    color='custom_type',
    color_discrete_map={'cCREs without Repeats, same':'darksalmon', 'Repeats without cCREs, same':'tomato', 'cCREs without Repeats, different':'cornflowerblue', 'Repeats without cCREs, different':'royalblue',},
    category_orders={"group": ['z-dna', 'g4', 'sidd', 'h-dna'], "mirna": form_df['mirna'].sort_values(), "custom_type":['Repeats without cCREs, same', "cCREs without Repeats, same", 'Repeats without cCREs, different', "cCREs without Repeats, different"]},
    # hover_data=['total_mirna_group_count'],
)

fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[1].upper()))

fig.update_layout(
    title='miRNA strand co-direction between two non-overlapping flipon groups',
    height=3500, width=1500,
)

fig.write_image('../img/flipon_strand_distribution_mirna_flipon_type_relative.png')
fig.show()

In [None]:
fig = px.bar(
    form_df,
    x='total_mirna_signed_rel',
    y='mirna',
    facet_col='group',
    text='total_mirna_group_count',
    color='custom_type',
    color_discrete_map={'cCREs without Repeats, same':'darksalmon', 'Repeats without cCREs, same':'tomato', 'cCREs without Repeats, different':'cornflowerblue', 'Repeats without cCREs, different':'royalblue',},
    category_orders={"group": ['z-dna', 'g4', 'sidd', 'h-dna'], "mirna": form_df['mirna'].sort_values(), "custom_type":['Repeats without cCREs, same', "cCREs without Repeats, same", 'Repeats without cCREs, different', "cCREs without Repeats, different"]},
    # hover_data=['total_mirna_group_count'],
)

fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[1].upper()))

fig.update_layout(
    title='miRNA strand co-direction between two non-overlapping flipon groups',
    height=3500, width=1500,
)

fig.write_image('../img/flipon_strand_distribution_mirna_relative.png')
fig.show()