## Imports

In [None]:
!apt-get install bedtools
!pip install loguru biopython

In [2]:
import os
import pandas as pd

## Data

### Utilities and genome files

In [None]:
"""Get mm10 genome data."""

!wget -c https://hgdownload.soe.ucsc.edu/goldenPath/mm10/bigZips/mm10.fa.gz
!gzip -df mm10.fa.gz

!wget -c http://hgdownload.cse.ucsc.edu/goldenpath/mm10/bigZips/mm10.chrom.sizes

In [None]:
"""Get liftOver tool files."""

!wget -c http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/liftOver
!wget -c https://hgdownload.cse.ucsc.edu/goldenpath/mm9/liftOver/mm9ToMm10.over.chain.gz
!chmod +x {'./liftOver'}

### Kouzine peaks

In [None]:
"""Get Kouzine ssDNA data."""

!wget -c https://www.ncbi.nlm.nih.gov/CBBresearch/Przytycka/software/nonbdna/nonB_DNA_ssDNA_enriched.tar

!rm -r nonB_DNA_ssDNA_enriched ; tar -xvf nonB_DNA_ssDNA_enriched.tar
with os.scandir('nonB_DNA_ssDNA_enriched/mouse_mm9') as it:
  for entry in it:
    !gzip -df {entry.path}

## Preprocessing

### Lift over Kouzine peaks (mm9 → mm10)

In [6]:
"""Uplift from mm9 to mm10."""

!rm -r mm10_kouzine_ssDNA ; mkdir -p mm10_kouzine_ssDNA
with os.scandir('nonB_DNA_ssDNA_enriched/mouse_mm9') as it:
  for entry in it:
    new_entry_path = 'mm10_kouzine_ssDNA/mm10_kouzine_' + entry.name.lower()

    !tail -n +2 {entry.path} > temp && mv temp {entry.path}
    !./liftOver {entry.path} mm9ToMm10.over.chain.gz {new_entry_path} unmapped_regions.bed

rm: cannot remove 'mm10_kouzine_ssDNA': No such file or directory
Reading liftover chains
Mapping coordinates
Reading liftover chains
Mapping coordinates
Reading liftover chains
Mapping coordinates
Reading liftover chains
Mapping coordinates


### SIDD slops

In [7]:
"""Calculate slops for SIDD."""

path_to_sidd = 'mm10_kouzine_ssDNA/mm10_kouzine_actb_ssdna_enriched_sidd.bed'
slop_values = [100, 200, 500]

for slop_val in slop_values:
  slop_path = path_to_sidd.replace('.bed', f'_slop{slop_val}.bed')
  !bedtools slop -i {path_to_sidd} -g mm10.chrom.sizes -b {slop_val} > {slop_path}

### Intersections

In [8]:
"""Intersect other flipons with SIDD."""

kouzine_data = [
  ('z-dna', './mm10_kouzine_ssDNA/mm10_kouzine_actb_ssdna_enriched_z-dna.bed'),
  ('g4',    './mm10_kouzine_ssDNA/mm10_kouzine_actb_ssdna_enriched_quadruplex.bed'),
  ('h-dna', './mm10_kouzine_ssDNA/mm10_kouzine_actb_ssdna_enriched_h-dna.bed'),
]

sidd_slops = [f'mm10_kouzine_ssDNA/mm10_kouzine_actb_ssdna_enriched_sidd_slop{x}.bed' for x in slop_values]
sidd_slops.insert(0, 'mm10_kouzine_ssDNA/mm10_kouzine_actb_ssdna_enriched_sidd.bed')

for _, a in kouzine_data:
  for i, b in enumerate(sidd_slops):
    intersection_path = a.replace('.bed', '_and'+b.split('enriched')[1].strip())
    !bedtools intersect -a {a} -b {b} -u > {intersection_path}
    !wc -l {intersection_path}

256 ./mm10_kouzine_ssDNA/mm10_kouzine_actb_ssdna_enriched_z-dna_and_sidd.bed
758 ./mm10_kouzine_ssDNA/mm10_kouzine_actb_ssdna_enriched_z-dna_and_sidd_slop100.bed
1117 ./mm10_kouzine_ssDNA/mm10_kouzine_actb_ssdna_enriched_z-dna_and_sidd_slop200.bed
1802 ./mm10_kouzine_ssDNA/mm10_kouzine_actb_ssdna_enriched_z-dna_and_sidd_slop500.bed
1374 ./mm10_kouzine_ssDNA/mm10_kouzine_actb_ssdna_enriched_quadruplex_and_sidd.bed
2250 ./mm10_kouzine_ssDNA/mm10_kouzine_actb_ssdna_enriched_quadruplex_and_sidd_slop100.bed
2534 ./mm10_kouzine_ssDNA/mm10_kouzine_actb_ssdna_enriched_quadruplex_and_sidd_slop200.bed
3089 ./mm10_kouzine_ssDNA/mm10_kouzine_actb_ssdna_enriched_quadruplex_and_sidd_slop500.bed
1627 ./mm10_kouzine_ssDNA/mm10_kouzine_actb_ssdna_enriched_h-dna_and_sidd.bed
2016 ./mm10_kouzine_ssDNA/mm10_kouzine_actb_ssdna_enriched_h-dna_and_sidd_slop100.bed
2249 ./mm10_kouzine_ssDNA/mm10_kouzine_actb_ssdna_enriched_h-dna_and_sidd_slop200.bed
2560 ./mm10_kouzine_ssDNA/mm10_kouzine_actb_ssdna_enriched_h

### Overall slops

In [9]:
"""Calculate slops for each bedfile (required to find overlaps with miRNA)."""

import os

with os.scandir('./mm10_kouzine_ssDNA') as it:
  for entry in it:
    for slop_val in slop_values:
      path_to_slop = entry.path.replace('.bed', f'.slop{slop_val}.bed')
      !bedtools slop -i {entry.path} -g mm10.chrom.sizes -b {slop_val} > {path_to_slop}