## Misc

In [1]:
import os
import pandas as pd

In [2]:
# Project root. Defaults to the original author's checkout; set the
# ARTICLE_CONSERVED_MIRNA_ROOT environment variable to run the notebook
# from another location without editing this cell.
d_root = os.environ.get(
    'ARTICLE_CONSERVED_MIRNA_ROOT',
    '/home/fed/GitHub/article_conserved_miRNA/',
)

# os.path.join copes with a trailing slash in d_root, so the derived paths
# do not contain a doubled '//' separator.
d_data = os.path.join(d_root, 'data')
d_utils = os.path.join(d_root, 'utils')

# Equivalent of `mkdir -p`, but safe for paths containing spaces.
os.makedirs(d_data, exist_ok=True)
os.makedirs(d_utils, exist_ok=True)

# Input
f_liftover = f'{d_utils}/liftOver'
f_mm9tomm10chain = f'{d_data}/mm9ToMm10.over.chain.gz'
f_mm10_fa = f'{d_data}/mm10.fa'
f_mm10_chrom_sizes = f'{d_data}/mm10.chrom.sizes'


# Output
d_kouzine_bed = f'{d_data}/mm10_kouzine_ssDNA_bed'
d_kouzine_fa = f'{d_data}/mm10_kouzine_ssDNA_fa'

os.makedirs(d_kouzine_bed, exist_ok=True)
os.makedirs(d_kouzine_fa, exist_ok=True)

## Data

### Utilities and genome files

In [3]:
"""Get mm10 genome data."""

!wget -c -q -O {f_mm10_chrom_sizes} http://hgdownload.cse.ucsc.edu/goldenpath/mm10/bigZips/mm10.chrom.sizes
!wget -c -q -P {d_data} https://hgdownload.soe.ucsc.edu/goldenPath/mm10/bigZips/mm10.fa.gz
!gzip -dcf {d_data}/mm10.fa.gz > {f_mm10_fa}

In [4]:
"""Get liftOver tool files."""

!wget -c -q -O {f_liftover} http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/liftOver
!wget -c -q -O {f_mm9tomm10chain} https://hgdownload.cse.ucsc.edu/goldenpath/mm9/liftOver/mm9ToMm10.over.chain.gz
!chmod +x {f_liftover}

### Kouzine peaks

In [5]:
"""Get Kouzine ssDNA data."""

!wget -c -q -P {d_data} https://www.ncbi.nlm.nih.gov/CBBresearch/Przytycka/software/nonbdna/nonB_DNA_ssDNA_enriched.tar

!rm -r {d_data}/nonB_DNA_ssDNA_enriched
!tar -xvf {d_data}/nonB_DNA_ssDNA_enriched.tar -C {d_data}

with os.scandir(f'{d_data}/nonB_DNA_ssDNA_enriched/mouse_mm9') as it:
    for entry in it:
        !gzip -df {entry.path}

nonB_DNA_ssDNA_enriched/
nonB_DNA_ssDNA_enriched/human_hg19/
nonB_DNA_ssDNA_enriched/human_hg19/Raji_ssDNA_enriched_H-DNA.bed.gz
nonB_DNA_ssDNA_enriched/human_hg19/Raji_ssDNA_enriched_Quadruplex.bed.gz
nonB_DNA_ssDNA_enriched/human_hg19/Raji_ssDNA_enriched_SIDD.bed.gz
nonB_DNA_ssDNA_enriched/human_hg19/Raji_ssDNA_enriched_Z-DNA.bed.gz
nonB_DNA_ssDNA_enriched/mouse_mm9/
nonB_DNA_ssDNA_enriched/mouse_mm9/actB_ssDNA_enriched_H-DNA.bed.gz
nonB_DNA_ssDNA_enriched/mouse_mm9/actB_ssDNA_enriched_Quadruplex.bed.gz
nonB_DNA_ssDNA_enriched/mouse_mm9/actB_ssDNA_enriched_SIDD.bed.gz
nonB_DNA_ssDNA_enriched/mouse_mm9/actB_ssDNA_enriched_Z-DNA.bed.gz


## Preprocessing

### Lift over Kouzine peaks (mm9 to mm10)

In [29]:
"""Uplift from mm9 to mm10."""

!rm -r {d_kouzine_bed} ; mkdir -p {d_kouzine_bed}

with os.scandir(f'{d_data}/nonB_DNA_ssDNA_enriched/mouse_mm9') as it:
    for entry in it:
        new_entry_path = f'{d_kouzine_bed}/mm10_kouzine_' + entry.name.lower()

        !tail -n +2 {entry.path} > temp && mv temp {entry.path}
        !{f_liftover} {entry.path} {f_mm9tomm10chain} {new_entry_path} unmapped_regions.bed
        !rm unmapped_regions.bed

Reading liftover chains
Mapping coordinates
Reading liftover chains
Mapping coordinates
Reading liftover chains
Mapping coordinates
Reading liftover chains
Mapping coordinates


### Slopped intervals (bedtools slop)

In [30]:
"""Calculate slops for each bedfile (required to find overlaps with miRNA)."""

slop_values = [100, 200, 500]
with os.scandir(d_kouzine_bed) as it:
    for entry in it:
        print(entry.path)
        (
            pd.read_table(entry.path, header=None)
            .iloc[:,:3]
            .assign(name=lambda x: x[0] + ":" + x[1].astype(str) + "-" + x[2].astype(str))
            .to_csv(entry.path, sep='\t', header=False, index=False)
        )
        for slop_val in slop_values:
            path_to_slop = entry.path.replace('.bed', f'.slop{slop_val}.bed')
            !bedtools slop -i {entry.path} -g {f_mm10_chrom_sizes} -b {slop_val} > {path_to_slop}

/home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_h-dna.bed
/home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_quadruplex.bed
/home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_sidd.bed
/home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_z-dna.bed


### Intersections

In [35]:
"""Intersect other flipons with SIDD."""

kouzine_data = [
    f'{d_kouzine_bed}/mm10_kouzine_actb_ssdna_enriched_z-dna.bed',
    f'{d_kouzine_bed}/mm10_kouzine_actb_ssdna_enriched_quadruplex.bed',
    f'{d_kouzine_bed}/mm10_kouzine_actb_ssdna_enriched_h-dna.bed',
]

sidd_slops = [
    f'{d_kouzine_bed}/mm10_kouzine_actb_ssdna_enriched_sidd.bed',
    *[f'{d_kouzine_bed}/mm10_kouzine_actb_ssdna_enriched_sidd.slop{x}.bed' for x in slop_values]
]

for a in kouzine_data:
    for b in sidd_slops:
        intersection_path = a.replace('.bed', '_and' + b.split('enriched')[1].strip())
        !bedtools intersect -a {a} -b {b} -u > {intersection_path}
        !wc -l {intersection_path}

256 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_z-dna_and_sidd.bed
757 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_z-dna_and_sidd.slop100.bed
1116 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_z-dna_and_sidd.slop200.bed
1801 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_z-dna_and_sidd.slop500.bed
1374 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_quadruplex_and_sidd.bed
2250 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_quadruplex_and_sidd.slop100.bed
2534 /home/fed/GitHub/article_conserved_miRNA//data/mm10_kouzine_ssDNA_bed/mm10_kouzine_actb_ssdna_enriched_quadruplex_and_sidd.slop200.bed
3089 /home/fed/GitHub/article_conserved_miRNA//data/mm10_k

## Fasta

In [37]:
"""Calculate fasta files."""

!rm -r {d_kouzine_fa} ; mkdir -p {d_kouzine_fa}
with os.scandir(d_kouzine_bed) as it:
    for entry in it:
        path_fa = entry.path.replace('.bed', '.fa').replace('ssDNA_bed', 'ssDNA_fa')
        print(path_fa)
        
        !bedtools getfasta -fi {f_mm10_fa} -bed {entry.path} > {path_fa}

index file /home/fed/GitHub/article_conserved_miRNA//data/mm10.fa.fai not found, generating...
