## Misc

In [1]:
import os
import subprocess as sp
from collections import Counter
from dataclasses import dataclass
from pathlib import Path
from typing import Optional

import pandas as pd
from Bio import SeqIO
from tqdm.notebook import tqdm

from datapaths import *

# Create every working directory used by the cells below; `mkdir -p` does nothing
# for directories that already exist, so the cell is safe to re-run.
# The D_* names are not defined in this notebook — presumably they come from
# `from datapaths import *` (TODO confirm).
!mkdir -p {D_DATA} {D_UTILS} {D_GENOME} {D_LIFTOVER} {D_TABLES} {D_FLIPONS_BED} {D_FLIPONS_FA} {D_CCRE} {D_RMSK} {D_IMG} {D_GEBR}

## Data

In [3]:
"""Get Kouzine ssDNA data."""

# -c resumes a partial download, -q --show-progress prints only the progress bar,
# -P stores the archive inside D_DATA under its remote file name.
!wget -c -q --show-progress -P {D_DATA} https://www.ncbi.nlm.nih.gov/CBBresearch/Przytycka/software/nonbdna/nonB_DNA_ssDNA_enriched.tar

In [4]:
"""Get mm10 genome data."""

!wget -c -q --show-progress -O {F_CHROM_SIZES} http://hgdownload.cse.ucsc.edu/goldenpath/mm10/bigZips/mm10.chrom.sizes
!wget -c -q --show-progress -P {D_GENOME} https://hgdownload.soe.ucsc.edu/goldenPath/mm10/bigZips/mm10.fa.gz
!cd {D_GENOME} && gzip -cdn mm10.fa.gz > mm10.fa

In [5]:
"""Get mm10 gene annotation."""

# GENCODE mouse release M25 annotation (gzipped GTF), saved to F_GENCODE.
!wget -c -q --show-progress -O {F_GENCODE} http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.annotation.gtf.gz

In [6]:
"""Get liftOver tool files."""

!wget -c -q --show-progress -O {F_LIFTOVER} http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/liftOver
!wget -c -q --show-progress -O {F_LIFTOVER_CHAIN} https://hgdownload.cse.ucsc.edu/goldenpath/mm9/liftOver/mm9ToMm10.over.chain.gz
!chmod +x {F_LIFTOVER}

In [7]:
"""Get bigBedToBed."""

# Download the UCSC bigBedToBed binary (linux x86_64 build) and make it executable;
# it is used below to convert the cCRE bigBed into plain BED.
!wget -c -q --show-progress -O {F_BIGBEDTOBED} https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bigBedToBed
!chmod +x {F_BIGBEDTOBED}

In [8]:
"""Get cCRE data."""

!chmod +x {F_BIGBEDTOBED}
!{F_BIGBEDTOBED} http://hgdownload.soe.ucsc.edu/gbdb/mm10/encode3/ccre/encodeCcreCombined.bb stdout > {F_CCRE}

(
    pd.read_table(F_CCRE, header=None)
    .assign(
        name_preformatted=lambda x: x[9] + "@" + x[3],
        name=lambda x: x["name_preformatted"].str.replace(
            ",CTCF-bound", " (CTCF-bound)"
        ),
    )
    .loc[:, [0, 1, 2, "name"]]
).to_csv(F_CCRE, sep="\t", header=False, index=False)


In [9]:
"""Get rmsk data."""

!wget -c -q --show-progress -O {F_RMSK} https://hgdownload.soe.ucsc.edu/goldenPath/mm10/database/rmsk.txt.gz

columns = [
    "bin", "swScore", "milliDiv", "milliDel", "milliIns", "genoName", "genoStart", "genoEnd", "genoLeft", "strand", "repName", "repClass", "repFamily", "repStart", "repEnd", "repLeft", "id"
]

(
    pd.read_table(F_RMSK, header=None, names=columns, compression="gzip")
    .query('repClass.str.contains("LTR") or repClass.str.contains("LINE")')
    .assign(
        name=lambda x: x["repClass"] + "@" + x["repName"] + "(" + x["strand"] + ")"
    )
    .loc[:, ["genoName", "genoStart", "genoEnd", "name", "swScore", "strand"]]
    .to_csv(F_RMSK_LINE_LTR, sep="\t", header=None, index=None)
)


## Preprocessing

### Uplift Kouzine peaks

In [10]:
"""Uplift from mm9 to mm10."""

!rm -r {D_DATA}/nonB_DNA_ssDNA_enriched
!tar -xvf {D_DATA}/nonB_DNA_ssDNA_enriched.tar -C {D_DATA}
!cd {D_DATA}/nonB_DNA_ssDNA_enriched/mouse_mm9 && ls | xargs gzip -df

!rm -r {D_DATA}/nonB_DNA_ssDNA_enriched/mouse_mm10 ; mkdir -p {D_DATA}/nonB_DNA_ssDNA_enriched/mouse_mm10
!rm -r {D_FLIPONS_BED} ; mkdir -p {D_FLIPONS_BED}

for file in (D_DATA / 'nonB_DNA_ssDNA_enriched/mouse_mm9').iterdir():
    new_file_path = Path(str(file).replace('mm9', 'mm10')).parent / ("mm10." + file.name.replace('Quadruplex', 'G4').lower())
    print(f"{file.name} -> {new_file_path.name}")

    !tail -n +2 {file} > temp && mv temp {file}
    !{F_LIFTOVER} {file} {F_LIFTOVER_CHAIN} {new_file_path} unmapped_regions.bed
    !cat unmapped_regions.bed | grep "#Deleted in new" | wc -l ; rm unmapped_regions.bed

    !sort --unique -k1,1 -k2,2n -o {new_file_path} {new_file_path}


nonB_DNA_ssDNA_enriched/
nonB_DNA_ssDNA_enriched/human_hg19/
nonB_DNA_ssDNA_enriched/human_hg19/Raji_ssDNA_enriched_H-DNA.bed.gz
nonB_DNA_ssDNA_enriched/human_hg19/Raji_ssDNA_enriched_Quadruplex.bed.gz
nonB_DNA_ssDNA_enriched/human_hg19/Raji_ssDNA_enriched_SIDD.bed.gz
nonB_DNA_ssDNA_enriched/human_hg19/Raji_ssDNA_enriched_Z-DNA.bed.gz
nonB_DNA_ssDNA_enriched/mouse_mm9/
nonB_DNA_ssDNA_enriched/mouse_mm9/actB_ssDNA_enriched_H-DNA.bed.gz
nonB_DNA_ssDNA_enriched/mouse_mm9/actB_ssDNA_enriched_Quadruplex.bed.gz
nonB_DNA_ssDNA_enriched/mouse_mm9/actB_ssDNA_enriched_SIDD.bed.gz
nonB_DNA_ssDNA_enriched/mouse_mm9/actB_ssDNA_enriched_Z-DNA.bed.gz
rm: cannot remove '/home/fpavlov/projects/article_conserved_miRNA/data/nonB_DNA_ssDNA_enriched/mouse_mm10': No such file or directory
actB_ssDNA_enriched_Z-DNA.bed -> mm10.actb_ssdna_enriched_z-dna.bed
Reading liftover chains
Mapping coordinates
2
actB_ssDNA_enriched_H-DNA.bed -> mm10.actb_ssdna_enriched_h-dna.bed
Reading liftover chains
Mapping coordina

### Slops

In [11]:
"""Calculate slops for each bedfile (required to find overlaps with miRNA)."""

slop_values = [100, 200, 500]
for file in (D_DATA / "nonB_DNA_ssDNA_enriched/mouse_mm10").iterdir():
    print(file.name)
    flipon_name = file.name.split("ed_")[1][:-4]
    path_to_flipon = D_FLIPONS_BED / file.name

    flipon_data = (
        pd.read_table(file, header=None).astype({1: int, 2: int}).sort_values([0, 1])
    )
    flipon_data[3] = flipon_data[0] + ":" + flipon_data[1].astype(str) + "-" + flipon_data[2].astype(str)
    if len(flipon_data.columns) == 4:
        flipon_data[4] = 0
        flipon_data[5] = "."
    flipon_data.to_csv(path_to_flipon, sep="\t", header=False, index=False)

    for slop_val in slop_values:
        path_to_slop = str(path_to_flipon).replace(".bed", f".slop{slop_val}.bed")
        !bedtools slop -i {path_to_flipon} -g {F_CHROM_SIZES} -b {slop_val} > {path_to_slop}

mm10.actb_ssdna_enriched_g4.bed
mm10.actb_ssdna_enriched_z-dna.bed
mm10.actb_ssdna_enriched_sidd.bed
mm10.actb_ssdna_enriched_h-dna.bed


### Intersections

In [12]:
"""Intersect other flipons with SIDD."""

kouzine_data = [
    D_FLIPONS_BED / "mm10.actb_ssdna_enriched_z-dna.bed",
    D_FLIPONS_BED / "mm10.actb_ssdna_enriched_g4.bed",
    D_FLIPONS_BED / "mm10.actb_ssdna_enriched_h-dna.bed",
]

# SIDD regions as-is plus each slopped variant.
sidd_slops = [
    D_FLIPONS_BED / "mm10.actb_ssdna_enriched_sidd.bed",
    *[D_FLIPONS_BED / f"mm10.actb_ssdna_enriched_sidd.slop{x}.bed" for x in slop_values],
]

for a in kouzine_data:
    for b in sidd_slops:
        # "<a>.bed" + "<...>enriched_sidd[.slopN].bed" -> "<a>_and_sidd[.slopN].bed".
        # Built from file names only, so "enriched" or ".bed" in a directory name
        # cannot corrupt the result.
        path_to_intersection = a.with_name(
            a.name.replace(".bed", f'_and{b.name.split("enriched")[1]}')
        )
        # -u: report each region of `a` once if it overlaps anything in `b`.
        with open(path_to_intersection, "w") as f_out:
            sp.run(
                ["bedtools", "intersect", "-a", str(a), "-b", str(b), "-u"],
                check=True,
                stdout=f_out,
            )

        # Count regions in Python instead of `wc -l` through a shell string, which
        # breaks on paths containing spaces and hides failures.
        with open(path_to_intersection) as f_in:
            n_regions = sum(1 for _ in f_in)
        print(f"{path_to_intersection.name}: {n_regions:,d}")


mm10.actb_ssdna_enriched_z-dna_and_sidd.bed: 256
mm10.actb_ssdna_enriched_z-dna_and_sidd.slop100.bed: 758
mm10.actb_ssdna_enriched_z-dna_and_sidd.slop200.bed: 1,117
mm10.actb_ssdna_enriched_z-dna_and_sidd.slop500.bed: 1,802
mm10.actb_ssdna_enriched_g4_and_sidd.bed: 1,374
mm10.actb_ssdna_enriched_g4_and_sidd.slop100.bed: 2,250
mm10.actb_ssdna_enriched_g4_and_sidd.slop200.bed: 2,534
mm10.actb_ssdna_enriched_g4_and_sidd.slop500.bed: 3,089
mm10.actb_ssdna_enriched_h-dna_and_sidd.bed: 1,627
mm10.actb_ssdna_enriched_h-dna_and_sidd.slop100.bed: 2,016
mm10.actb_ssdna_enriched_h-dna_and_sidd.slop200.bed: 2,249
mm10.actb_ssdna_enriched_h-dna_and_sidd.slop500.bed: 2,560


## Fasta

In [13]:
"""Calculate fasta files."""

!rm -r {F_GENOME}.fai
!rm -r {D_FLIPONS_FA} ; mkdir -p {D_FLIPONS_FA}

for file in D_FLIPONS_BED.iterdir():
    path_fa = D_FLIPONS_FA / file.name.replace('bed', 'fa')
    with open(path_fa, 'w') as f_out:
        sp.run(["bedtools", "getfasta", "-fi", F_GENOME, "-bed", file], check=True, stdout=f_out)

index file /home/fpavlov/projects/article_conserved_miRNA/data/genome/mm10.fa.fai not found, generating...


## Map flipons to miRNA

In [14]:
@dataclass
class Data:
    """One flipon class and the files that belong to it.

    Attributes:
        name: Short flipon label (e.g. "g4").
        path_fa: FASTA file with the region sequences.
        path_bed: BED file with the regions.
        path_bed_200: BED file used for the cCRE / repeat intersections.
        shape: Number of lines in ``path_bed``; always recomputed in
            ``__post_init__``, so a value passed to the constructor is ignored.
        path_ccre_intersection: Derived output path
            (``D_FLIPONS_BED / <path_bed_200 name with "_and_ccre.bed">``).
        path_rmsk_intersection: Derived output path
            (``D_FLIPONS_BED / <path_bed_200 name with "_and_rmsk.bed">``).
    """

    name: str
    path_fa: Path
    path_bed: Path
    path_bed_200: Path
    shape: Optional[int] = None
    path_ccre_intersection: Optional[Path] = None
    path_rmsk_intersection: Optional[Path] = None

    def __post_init__(self):
        # Accept plain strings as well: `.name` below needs real Path objects.
        self.path_fa = Path(self.path_fa)
        self.path_bed = Path(self.path_bed)
        self.path_bed_200 = Path(self.path_bed_200)

        self.shape = self._get_shape()
        self.path_ccre_intersection = D_FLIPONS_BED / self.path_bed_200.name.replace('.bed', '_and_ccre.bed')
        self.path_rmsk_intersection = D_FLIPONS_BED / self.path_bed_200.name.replace('.bed', '_and_rmsk.bed')

    def _get_shape(self):
        """Return the number of lines in ``path_bed``."""
        with open(self.path_bed, "r") as f_in:
            return sum(1 for _ in f_in)


# One Data record per flipon class, in the order g4, sidd, z-dna, h-dna.
# All four follow the same file-name pattern, so they are generated from the label.
flipon_data = [
    Data(
        flipon_label,
        D_FLIPONS_FA / f"mm10.actb_ssdna_enriched_{flipon_label}.fa",
        D_FLIPONS_BED / f"mm10.actb_ssdna_enriched_{flipon_label}.bed",
        D_FLIPONS_BED / f"mm10.actb_ssdna_enriched_{flipon_label}.slop200.bed",
    )
    for flipon_label in ("g4", "sidd", "z-dna", "h-dna")
]


In [15]:
def format_mirna_counts(mirna_counts: list):
    """Render per-region miRNA hit counts as display strings.

    Args:
        mirna_counts: One mapping per region, from miRNA family name to hit count.

    Returns:
        One string per region: the "family (count)" entries, sorted and joined
        with ", ". A region with no entries yields an empty string.
    """
    return [
        ", ".join(
            sorted(f"{family} ({n_hits:,d})" for family, n_hits in region.items())
        )
        for region in mirna_counts
    ]

def reverse_complement(x: str):
    """Return the reverse complement of an upper-case DNA string.

    Only A, C, G, T and N are accepted; any other character raises KeyError.
    """
    complement = {"A": "T", "C": "G", "G": "C", "T": "A", "N": "N"}
    return "".join(complement[base] for base in reversed(x))

In [16]:
# Load the miRNA family table. Seeds are converted from RNA to DNA letters (U -> T)
# and split into a list per family; a second column holds their reverse complements.
mirna_df = (
    pd.read_table(F_MIRNA_LIST)
    .assign(
        **{
            "Seed region": lambda df: df["Seed region"]
            .str.replace("U", "T")
            .str.split(", ")
        }
    )
    .assign(
        **{
            "Seed region (RC)": lambda df: df["Seed region"].apply(
                lambda seeds: [reverse_complement(seed) for seed in seeds]
            )
        }
    )
)

print(mirna_df.shape)
mirna_df.iloc[10:20, :]


(177, 5)


Unnamed: 0,Conservation,miRNA gene family,# of genes,Seed region,Seed region (RC)
10,Bilateria,miR-34/449,6,[GGCAGTG],[CACTGCC]
11,Bilateria,miR-96/1271,2,[TTGGCAC],[GTGCCAA]
12,Bilateria,miR-99/100,3,[ACCCGTA],[TACGGGT]
13,Bilateria,miR-124,3,"[AAGGCAC, TAAGGCA]","[GTGCCTT, TGCCTTA]"
14,Bilateria,miR-125,3,[CCCTGAG],[CTCAGGG]
15,Bilateria,miR-133,3,"[TGGTCCC, TTGGTCC]","[GGGACCA, GGACCAA]"
16,Bilateria,miR-153,2,[TGCATAG],[CTATGCA]
17,Bilateria,miR-183,1,"[ATGGCAC, TGGCACT]","[GTGCCAT, AGTGCCA]"
18,Bilateria,miR-184,1,[GGACGGA],[TCCGTCC]
19,Bilateria,miR-190,2,[GATATGT],[ACATATC]


In [17]:
# (family, [(forward seed, reverse-complement seed), ...]) for every miRNA family.
# Loop-invariant, so it is built once instead of once per sequence.
seed_table = [
    (mirna_family, list(zip(motiffs_f, motiffs_rc)))
    for mirna_family, motiffs_f, motiffs_rc in mirna_df[
        ["miRNA gene family", "Seed region", "Seed region (RC)"]
    ].values
]

flipon_frames = []

for flipon in tqdm(flipon_data):
    print(f"{flipon.name}: {flipon.shape:,d} regions")

    fasta = list(SeqIO.parse(flipon.path_fa, "fasta"))

    # One Counter per FASTA record. The lists grow with the records actually read,
    # so they always match "Coordinates" in length even if the FASTA has fewer
    # entries than the BED file has lines.
    mirna_list_fwd = []
    mirna_list_rcm = []
    for record in fasta:
        rec_fwd = record.seq.upper()
        counter_fwd = Counter()
        counter_rcm = Counter()
        for mirna_family, seed_pairs in seed_table:
            for motiff_fwd, motiff_rcm in seed_pairs:
                counter_fwd[mirna_family] += rec_fwd.count(motiff_fwd)
                counter_rcm[mirna_family] += rec_fwd.count(motiff_rcm)
        # Unary plus drops families whose count is zero.
        mirna_list_fwd.append(+counter_fwd)
        mirna_list_rcm.append(+counter_rcm)

    # Order matters: the intersection is computed from the Counter columns before
    # those columns are replaced by their formatted strings.
    kwargs = {
        "miRNA (intersection)": lambda x: [
            ", ".join(sorted(set(fwd) & set(rcm)))
            for fwd, rcm in zip(x["miRNA (+)"], x["miRNA (-)"])
        ],
        "miRNA (+)": lambda x: format_mirna_counts(x["miRNA (+)"].values),
        "miRNA (-)": lambda x: format_mirna_counts(x["miRNA (-)"].values),
    }
    df = pd.DataFrame(
        {
            "Flipon": flipon.name,
            "Coordinates": [x.id for x in fasta],
            "miRNA (+)": mirna_list_fwd,
            "miRNA (-)": mirna_list_rcm,
        }
    ).assign(**kwargs)

    flipon_frames.append(df.sort_values("Coordinates"))

# Concatenate once at the end instead of growing the frame inside the loop.
flipon_to_mirna_df = pd.concat(flipon_frames, ignore_index=True)

# quoting=2 is csv.QUOTE_NONNUMERIC.
flipon_to_mirna_df.to_csv(F_FLIPON_TO_MIRNA, sep="\t", quoting=2, index=False)
flipon_to_mirna_df


  0%|          | 0/4 [00:00<?, ?it/s]

g4: 20,253 regions
sidd: 15,296 regions
z-dna: 25,059 regions
h-dna: 17,100 regions


Unnamed: 0,Flipon,Coordinates,miRNA (+),miRNA (-),miRNA (intersection)
0,g4,chr10:100015775-100015790,,,
1,g4,chr10:100016214-100016246,,miR-149 (1),
2,g4,chr10:10012216-10012266,,,
3,g4,chr10:100147225-100147247,miR-365 (1),,
4,g4,chr10:100401253-100401285,,,
...,...,...,...,...,...
77703,h-dna,chrY:4200098-4200122,,,
77704,h-dna,chrY:4208594-4208610,,,
77705,h-dna,chrY:4223501-4223522,,,
77706,h-dna,chrY:4231086-4231113,,,


In [18]:
"""Export mirna-enriched flipons."""

for flipon_type, group_df in flipon_to_mirna_df.groupby("Flipon"):
    # Keep regions with a seed hit on at least one strand. An explicit mask is used
    # because `replace("", None)` forward-fills on older pandas versions instead of
    # producing missing values, which would keep almost every row.
    hit_columns = group_df[["miRNA (+)", "miRNA (-)"]]
    has_mirna = (hit_columns.notna() & hit_columns.ne("")).any(axis=1)
    (
        # "chrom:start-end" -> three BED columns.
        group_df.loc[has_mirna, "Coordinates"]
        .str.split(r":|-", regex=True, expand=True)
        .astype({1: int, 2: int})
        .sort_values([0, 1])
        .to_csv(
            D_FLIPONS_BED / f"mm10.actb_ssdna_enriched_{flipon_type}_and_mirna.bed",
            sep="\t",
            index=False,
            header=False,
        )
    )


## Map flipons to cCRE

In [19]:
for flipon in flipon_data:
    # -wo: write each overlapping pair (flipon ±200 bp, cCRE) plus the overlap length.
    with open(flipon.path_ccre_intersection, "w") as f_out:
        sp.run(
            ["bedtools", "intersect", "-a", str(flipon.path_bed_200), "-b", str(F_CCRE), "-wo"],
            check=True,
            stdout=f_out,
        )

    # Count overlaps in Python once the file is closed, instead of `wc -l` through a
    # shell string that breaks on paths containing spaces.
    with open(flipon.path_ccre_intersection) as f_in:
        n_overlaps = sum(1 for _ in f_in)
    print(f"{flipon.path_ccre_intersection.name}: {n_overlaps:,d}")


mm10.actb_ssdna_enriched_g4.slop200_and_ccre.bed: 14,637
mm10.actb_ssdna_enriched_sidd.slop200_and_ccre.bed: 1,683
mm10.actb_ssdna_enriched_z-dna.slop200_and_ccre.bed: 20,638
mm10.actb_ssdna_enriched_h-dna.slop200_and_ccre.bed: 3,756


In [20]:
# Collapse the pairwise overlaps into one row per flipon region, with all
# overlapping cCRE names joined by ", ".
ccre_frames = []

for flipon in flipon_data:
    overlaps = pd.read_table(flipon.path_ccre_intersection, header=None)
    overlaps["Flipon"] = flipon.name
    # Column 3 is the region key, column 9 the cCRE name.
    overlaps["cCRE (+-200bp)"] = overlaps.groupby([3])[9].transform(", ".join)

    per_region = overlaps.drop_duplicates(subset=[3])[["Flipon", 3, "cCRE (+-200bp)"]]
    ccre_frames.append(per_region.rename(columns={3: "Coordinates"}))

flipon_to_ccre_df = pd.concat(ccre_frames, ignore_index=True)

flipon_to_ccre_df.to_csv(F_FLIPON_TO_CCRE, index=False, sep="\t")
flipon_to_ccre_df


Unnamed: 0,Flipon,Coordinates,cCRE (+-200bp)
0,g4,chr1:3671869-3671902,pELS (CTCF-bound)@EM10E0431220
1,g4,chr1:4493714-4493748,dELS@EM10E0431244
2,g4,chr1:4571896-4571924,dELS (CTCF-bound)@EM10E0431262
3,g4,chr1:5018367-5018390,"DNase-H3K4me3 (CTCF-bound)@EM10E0431331, PLS@E..."
4,g4,chr1:5019245-5019272,"pELS@EM10E0431333, PLS (CTCF-bound)@EM10E0431334"
...,...,...,...
24415,h-dna,chrX:152769666-152769687,pELS (CTCF-bound)@EM10E0930778
24416,h-dna,chrX:159987892-159987912,DNase-H3K4me3 (CTCF-bound)@EM10E0931180
24417,h-dna,chrX:161717977-161717993,"PLS@EM10E0931282, pELS@EM10E0931283"
24418,h-dna,chrX:162643118-162643148,"pELS@EM10E0931392, pELS@EM10E0931393, pELS (CT..."


In [21]:
"""Export ccre-enriched flipons."""

for flipon_type, group_df in flipon_to_ccre_df.groupby("Flipon"):
    # Keep regions with a non-empty cCRE annotation. An explicit mask avoids
    # `replace("", None)`, whose meaning differs between pandas versions.
    ccre_labels = group_df["cCRE (+-200bp)"]
    has_ccre = ccre_labels.notna() & ccre_labels.ne("")
    (
        # "chrom:start-end" -> three BED columns.
        group_df.loc[has_ccre, "Coordinates"]
        .str.split(r":|-", regex=True, expand=True)
        .astype({1: int, 2: int})
        .sort_values([0, 1])
        .to_csv(
            D_FLIPONS_BED / f"mm10.actb_ssdna_enriched_{flipon_type}_and_ccre.bed",
            sep="\t",
            index=False,
            header=False,
        )
    )


## Map flipons to LINE/LTRs

In [22]:
for flipon in flipon_data:
    # -wo: write each overlapping pair (flipon ±200 bp, LINE/LTR repeat) plus the overlap length.
    with open(flipon.path_rmsk_intersection, "w") as f_out:
        sp.run(
            ["bedtools", "intersect", "-a", str(flipon.path_bed_200), "-b", str(F_RMSK_LINE_LTR), "-wo"],
            check=True,
            stdout=f_out,
        )

    # Count overlaps in Python once the file is closed, instead of `wc -l` through a
    # shell string that breaks on paths containing spaces.
    with open(flipon.path_rmsk_intersection) as f_in:
        n_overlaps = sum(1 for _ in f_in)
    print(f"{flipon.path_rmsk_intersection.name}: {n_overlaps:,d}")


mm10.actb_ssdna_enriched_g4.slop200_and_rmsk.bed: 13,336
mm10.actb_ssdna_enriched_sidd.slop200_and_rmsk.bed: 23,477
mm10.actb_ssdna_enriched_z-dna.slop200_and_rmsk.bed: 11,835
mm10.actb_ssdna_enriched_h-dna.slop200_and_rmsk.bed: 15,871


In [23]:
# Collapse the pairwise overlaps into one row per flipon region, with all
# overlapping LINE/LTR names joined by ", ".
repeat_frames = []

for flipon in flipon_data:
    overlaps = pd.read_table(flipon.path_rmsk_intersection, header=None)
    overlaps["Flipon"] = flipon.name
    # Column 3 is the region key, column 9 the repeat name.
    overlaps["LINE/LTR (+-200bp)"] = overlaps.groupby([3])[9].transform(", ".join)

    per_region = overlaps.drop_duplicates(subset=[3])[["Flipon", 3, "LINE/LTR (+-200bp)"]]
    repeat_frames.append(per_region.rename(columns={3: "Coordinates"}))

flipon_to_repeat_df = pd.concat(repeat_frames, ignore_index=True)

flipon_to_repeat_df.to_csv(F_FLIPON_TO_RMSK, index=False, sep="\t")
flipon_to_repeat_df


Unnamed: 0,Flipon,Coordinates,LINE/LTR (+-200bp)
0,g4,chr1:3014794-3014871,"LINE@L1Md_F2(-), LINE@L1VL1(+)"
1,g4,chr1:3099888-3099963,"LINE@L1Md_F2(+), LTR@MTC(+)"
2,g4,chr1:3287445-3287468,LINE@L1MD(-)
3,g4,chr1:3472953-3472969,LINE@L1_Mus1(+)
4,g4,chr1:3535948-3535996,"LINE@L1_Mus2(-), LINE@L1VL1(+)"
...,...,...,...
39775,h-dna,chrY:4200098-4200122,LINE@L1MD(+)
39776,h-dna,chrY:4208594-4208610,"LTR@ORR1E(+), LINE@L1Md_T(-)"
39777,h-dna,chrY:4223501-4223522,LINE@L1_Mus2(-)
39778,h-dna,chrY:4231086-4231113,LTR@MuRRS4-int(-)


In [24]:
"""Export line/ltr-enriched flipons."""

for flipon_type, group_df in flipon_to_repeat_df.groupby("Flipon"):
    # Keep regions with a non-empty LINE/LTR annotation. An explicit mask avoids
    # `replace("", None)`, whose meaning differs between pandas versions.
    repeat_labels = group_df["LINE/LTR (+-200bp)"]
    has_repeat = repeat_labels.notna() & repeat_labels.ne("")
    (
        # "chrom:start-end" -> three BED columns.
        group_df.loc[has_repeat, "Coordinates"]
        .str.split(r":|-", regex=True, expand=True)
        .astype({1: int, 2: int})
        .sort_values([0, 1])
        .to_csv(
            D_FLIPONS_BED / f"mm10.actb_ssdna_enriched_{flipon_type}_and_line_ltr.bed",
            sep="\t",
            index=False,
            header=False,
        )
    )


## Track names

In [25]:
import subprocess as sp

def get_color_by_name(name: str):
    """Return the track colour ("R,G,B") for a flipon track name.

    The first class label found in ``name`` wins, checked in the order
    G4, Z-DNA, SIDD, H-DNA. Returns None when no label is present.
    """
    palette = (
        ("G4", "245,139,0"),  # orange
        ("Z-DNA", "0,112,245"),  # blue
        ("SIDD", "255,49,38"),  # red
        ("H-DNA", "159,75,201"),  # purple
    )
    for flipon_class, rgb in palette:
        if flipon_class in name:
            return rgb
    return None

for file in D_FLIPONS_BED.iterdir():
    # Read the regions. A track line left by a previous run of this cell is removed
    # first, so re-running neither stacks headers nor inflates the region count.
    text = file.read_text().rstrip("\n")
    records = text.split("\n") if text else []
    if records and records[0].startswith("track "):
        records = records[1:]
    size = len(records)

    # File name -> display name, e.g. "z-dna_and_sidd.slop100" -> "Z-DNA & SIDD±100".
    name = (
        file.name.split("enriched_")[1]
        .replace(".bed", "")
        .replace("_", " ")
        .replace("and", "&")
        .replace(".slop", "±")
        .upper()
        .replace("MIRNA", "miRNA")
        .replace("LINE ", "LINE/")
        .replace("CCRE", "cCRE")
    )
    description = "Kouzine experimental " + name.split(" ")[0]
    if len(name.split(" & ")) > 1:
        description += " and " + name.split(" & ")[1].replace(
            "miRNA", "conserved miRNA"
        ).replace("/LTR", "/LTR repeats")
    description += f" ({size:,d})"
    color = get_color_by_name(name.split(' & ')[0])

    track_info = f'track name="{name}" description="{description}" color={color}'

    # Written from Python rather than `printf ... "$(cat file)" > file`, which relies
    # on shell expansion order and breaks on paths that need quoting.
    file.write_text("\n".join([track_info, *records]) + "\n")
