## Imports

In [None]:
!pip install biopython loguru

In [None]:
import subprocess
import numpy as np
import pandas as pd

from Bio import SeqIO
from loguru import logger
from tqdm.notebook import tqdm
from collections import Counter

## Data

In [None]:
"""Define the 7-nt seed sequences of each miRNA family."""

# Maps a miRNA family label to the list of its seed sequences. The seeds are
# written in the RNA alphabet here and converted to DNA right after the table.
mirna_dict = {
  'let-7/miR-98': ['GAGGUAG'],
  'miR-1/206': ['GGAAUGU'],
  'miR-7': ['GGAAGAC'],
  'miR-9': ['CUUUGGU'],
  'miR-10': ['ACCCUGU'],
  'miR-22': ['AGCUGCC'],
  'miR-25/32/92/363/367': ['AUUGCAC'],
  'miR-29': ['AGCACCA'],
  'miR-31': ['GGCAAGA'],
  'miR-33': ['UGCAUUG'],
  'miR-34/449': ['GGCAGUG'],
  'miR-96/1271': ['UUGGCAC'],
  'miR-99/100': ['ACCCGUA'],
  'miR-124': ['AAGGCAC', 'UAAGGCA'],
  'miR-125': ['CCCUGAG'],
  'miR-133': ['UGGUCCC', 'UUGGUCC'],
  'miR-153': ['UGCAUAG'],
  'miR-183': ['AUGGCAC', 'UGGCACU'],
  'miR-184': ['GGACGGA'],
  'miR-190': ['GAUAUGU'],
  'miR-193': ['GGGUCUU', 'ACUGGCC'],
  'miR-200bc/429': ['AAUACUG'],
  'miR-210': ['UGUGCGU'],
  'miR-216a': ['AAUCUCA'],
  'miR-219': ['GAUUGUC'],
  'miR-365': ['AAUGCCC'],
  'miR-375': ['UUGUUCG'],
  'miR-103/107': ['GCAGCAU'],
  'miR-129': ['UUUUUGC', 'AGCCCUU'],
  'miR-135': ['AUGGCUU'],
  'miR-182': ['UUGGCAA'],
  'miR-217': ['ACUGCAU'],
  'miR-15/16/195/424/497': ['AGCAGCA'],
  'miR-126': ['CGUACCG', 'GUACCGU'],
  'miR-196': ['AGGUAGU'],
  'miR-17/20/93/106/519d': ['AAAGUGC'],
  'miR-18': ['AAGGUGC'],
  'miR-19': ['GUGCAAA'],
  'miR-21/590': ['AGCUUAU'],
  'miR-23': ['UCACAUU'],
  'miR-24': ['GGCUCAG'],
  'miR-26': ['UCAAGUA'],
  'miR-27': ['UCACAGU'],
  'miR-30': ['GUAAACA'],
  'miR-122': ['GGAGUGU'],
  'miR-128': ['CACAGUG'],
  'miR-130/301/454': ['AGUGCAA'],
  'miR-132/212': ['AACAGUC'],
  'miR-137': ['UAUUGCU'],
  'miR-138': ['GCUGGUG'],
  'miR-140': ['AGUGGUU', 'CCACAGG', 'ACCACAG'],
  'miR-141/200a': ['AACACUG'],
  'miR-142': ['AUAAAGU', 'GUAGUGU', 'UAGUGUU'],
  'miR-143': ['GAGAUGA'],
  'miR-144': ['ACAGUAU'],
  'miR-145': ['UCCAGUU'],
  'miR-146': ['GAGAACU'],
  'miR-147': ['UGUGCGG'],
  'miR-148/152': ['CAGUGCA'],
  'miR-155': ['UAAUGCU'],
  'miR-181': ['ACAUUCA'],
  'miR-192/215': ['UGACCUA'],
  'miR-194': ['GUAACAG'],
  'miR-199': ['CCAGUGU', 'CAGUAGU'],
  'miR-202': ['UCCUAUG'],
  'miR-203a': ['UGAAAUG', 'GAAAUGU'],
  'miR-204/211': ['UCCCUUU'],
  'miR-205': ['CCUUCAU'],
  'miR-208': ['UAAGACG'],
  'miR-216b': ['AAUCUCU'],
  'miR-218': ['UGUGCUU'],
  'miR-221/222': ['GCUACAU'],
  'miR-302abd': ['AAGUGCU'],
  'miR-302c': ['AGUGCUU'],
  'miR-338': ['CCAGCAU'],
  'miR-455': ['AUGUGCC', 'CAGUCCA', 'UGCAGUC'],
  'miR-499': ['UAAGACU'],
  'miR-551': ['CGACCCA'],
  'miR-802': ['CAGUAAC'],
  'miR-1306': ['CACCUCC'],
  'miR-101': ['ACAGUAC', 'UACAGUA'],
  'miR-139': ['CUACAGU'],
  'miR-150': ['CUCCCAA'],
  'miR-191': ['AACGGAA'],
  'miR-214': ['GCCUGUC'],
  'miR-223': ['GUCAGUU'],
  'miR-425': ['AUGACAC'],
  'miR-187': ['CGUGUCU'],
  'miR-489': ['UGACAUC'],
  'miR-490': ['AACCUGG'],
  'miR-383': ['GAUCAGA', 'AGAUCAG'],
  'miR-186': ['AAAGAAU'],
  'miR-325': ['UUAUUGA'],
  'miR-873': ['CAGGAAC', 'GCAGGAA'],
  'miR-340': ['UAUAAAG'],
  'miR-1251': ['CUCUAGC'],
  'miR-28/708': ['AGGAGCU'],
  'miR-127': ['CGGAUCC'],
  'miR-134': ['GUGACUG'],
  'miR-136': ['CUCCAUU'],
  'miR-149': ['CUGGCUC'],
  'miR-151a': ['CGAGGAG', 'UAGACUG'],
  'miR-154': ['AGGUUAU', 'AUCAUAC'],
  'miR-185': ['GGAGAGA'],
  'miR-188': ['AUCCCUU'],
  'miR-224': ['AAGUCAC'],
  'miR-296': ['GGGCCCC', 'AGGGUUG'],
  'miR-299': ['GGUUUAC', 'AUGUGGG'],
  'miR-323a': ['ACAUUAC'],
  'miR-324': ['GCAUCCC'],
  'miR-328': ['UGGCCCU'],
  'miR-329/362': ['AUCCUUG', 'ACACACC'],
  'miR-330': ['CUCUGGG', 'CAAAGCA', 'AAAGCAC'],
  'miR-331': ['CCCCUGG'],
  'miR-335': ['CAAGAGC'],
  'miR-339': ['CCCUGUC'],
  'miR-342': ['CUCACAC'],
  'miR-346': ['GUCUGCC'],
  'miR-361': ['UAUCAGA'],
  'miR-369': ['AUAAUAC'],
  'miR-371 (290)': ['CUCAAAC'],
  'miR-374': ['UAUAAUA'],
  'miR-376ab': ['UCAUAGA'],
  'miR-376c': ['ACAUAGA'],
  'miR-377': ['UCACACA'],
  'miR-378a': ['CUGGACU'],
  'miR-379': ['GGUAGAC', 'AUGUAAC'],
  'miR-381': ['AUACAAG'],
  'miR-382': ['AAGUUGU', 'AUCAUUC'],
  'miR-409': ['GGUUACC', 'AAUGUUG'],
  'miR-410': ['AUAUAAC'],
  'miR-411': ['AGUAGAC', 'UAGUAGA'],
  'miR-412': ['GGUCGAC'],
  'miR-421': ['UCAACAG'],
  'miR-423': ['GAGGGGC', 'GCUCGGU'],
  'miR-431': ['GUCUUGC'],
  'miR-433': ['UCAUGAU'],
  'miR-448': ['UGCAUAU'],
  'miR-450a': ['UUUGCGA'],
  'miR-452': ['ACUGUUU'],
  'miR-483': ['ACUCCUC', 'CACUCCU'],
  'miR-485': ['GAGGCUG'],
  'miR-486': ['CCUGUAC'],
  'miR-487b': ['AUCGUAC'],
  'miR-488': ['UGAAAGG'],
  'miR-491': ['GUGGGGA'],
  'miR-493': ['UGUACAU', 'GAAGGUC'],
  'miR-494': ['GAAACAU'],
  'miR-495': ['AACAAAC'],
  'miR-496': ['GAGUAUU', 'GUAUUAC'],
  'miR-501/502': ['AUGCACC'],
  'miR-503': ['AGCAGCG'],
  'miR-504': ['ACCCUGG'],
  'miR-505': ['GUCAACA', 'UCAACAC'],
  'miR-532': ['AUGCCUU', 'CUCCCAC'],
  'miR-539': ['UCAUACA'],
  'miR-542': ['GUGACAG'],
  'miR-543': ['AACAUUC'],
  'miR-544a': ['CUUGUUA'],
  'miR-582': ['UACAGUU'],
  'miR-615': ['CCGAGCC'],
  'miR-652': ['AUGGCGC'],
  'miR-653': ['UGAAACA'],
  'miR-655': ['UAAUACA'],
  'miR-665': ['CCAGGAG'],
  'miR-668': ['GUCACUC'],
  'miR-670': ['UUCCUCA'],
  'miR-744': ['GCGGGGC'],
  'miR-758': ['UUGUGAC'],
  'miR-760': ['GGCUCUG'],
  'miR-874': ['UGCCCUG'],
  'miR-875': ['AUACCUC'],
  'miR-876': ['GGAUUUC'],
  'miR-1193': ['AGGUCAC'],
  'miR-1197': ['AGGACAC'],
  'miR-1249': ['CGCCCUU'],
  'miR-1298': ['UCAUUCG'],
}

# Rewrite every seed from the RNA to the DNA alphabet (U -> T). The lists are
# updated in place, so the dictionary keeps the same list objects.
for seed_list in mirna_dict.values():
  seed_list[:] = [seed.replace('U', 'T') for seed in seed_list]

## Overlaps between Kouzine peaks and miRNA

In [None]:
# For every structure type, count per miRNA family how many peak regions
# contain at least one seed motif -- in the original peaks and in their
# `.slop100` / `.slop200` variants -- and how many regions gain motif hits when
# moving from one variant to the next wider one.
#
# Results:
#   df       -- per-family number (`_#`) and percentage (`_%`) of regions with a hit
#   df_more  -- per-family number / percentage of regions whose hit count grew

kouzine_data = [
  ('z-dna',       './mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_z-dna.fa'),
  ('quadruplex',  './mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_quadruplex.fa'),
  ('sidd',        './mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_sidd.fa'),
  ('h-dna',       './mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_h-dna.fa'),
]

# Row labels shared by both result tables (dict order == matrix row order).
mirna_names = list(mirna_dict)


def _read_upper_sequences(fasta_path):
  """Return every sequence in `fasta_path` as an upper-cased plain string."""
  return [str(rec.seq).upper() for rec in SeqIO.parse(fasta_path, "fasta")]


peak_frames = []
more_frames = []

for name, path in tqdm(kouzine_data):
  name100 = name + '.slop100'
  name200 = name + '.slop200'

  # Upper-case each sequence once instead of once per (family, record) pair.
  seqs    = _read_upper_sequences(path)
  seqs100 = _read_upper_sequences(path.replace(name, name100))
  seqs200 = _read_upper_sequences(path.replace(name, name200))

  # Count the parsed records rather than parsing `wc -l` output: that relied on
  # exactly two lines per record and on GNU-style (unpadded) `wc` formatting.
  shape = len(seqs)
  logger.info(f'{name}: {shape:,d} regions')

  if not (len(seqs100) == len(seqs200) == shape):
    # Records are paired by position, so unequal files give misaligned pairs.
    logger.warning(
      f'{name}: record counts differ (base={shape:,d}, slop100={len(seqs100):,d}, '
      f'slop200={len(seqs200):,d}); only the common leading records are compared'
    )

  # (family x region) matrices of seed-motif occurrence counts.
  mirna_count     = np.zeros((len(mirna_dict), shape))
  mirna_count100  = np.zeros((len(mirna_dict), shape))
  mirna_count200  = np.zeros((len(mirna_dict), shape))

  for i, motif_list in enumerate(tqdm(mirna_dict.values(), leave=False)):
    for j, (seq, seq100, seq200) in enumerate(zip(seqs, seqs100, seqs200)):
      mirna_count[i, j]    = sum(seq.count(motif) for motif in motif_list)
      mirna_count100[i, j] = sum(seq100.count(motif) for motif in motif_list)
      mirna_count200[i, j] = sum(seq200.count(motif) for motif in motif_list)

  # A region "has" a family exactly when its motif count is positive, so the
  # presence counts follow from the count matrices without a second scan.
  cur_df_concat = pd.DataFrame(
    {
      name + '_#':    (mirna_count > 0).sum(axis=1),
      name100 + '_#': (mirna_count100 > 0).sum(axis=1),
      name200 + '_#': (mirna_count200 > 0).sum(axis=1),
    },
    index=mirna_names,
  )
  for label in (name, name100, name200):
    cur_df_concat[label + '_%'] = cur_df_concat[label + '_#'] / shape * 100
  peak_frames.append(cur_df_concat)

  # Regions whose motif count increased when widening the window.
  cur_df_more = pd.DataFrame(
    {
      name + '_slop100>slop_#':    (mirna_count100 > mirna_count).sum(axis=1),
      name + '_slop200>slop100_#': (mirna_count200 > mirna_count100).sum(axis=1),
    },
    # Label rows by family; previously this table carried an anonymous 0..N-1
    # index while `df` was indexed by family name.
    index=mirna_names,
  )
  cur_df_more[name + '_slop100>slop_%']    = cur_df_more[name + '_slop100>slop_#'] / shape * 100
  cur_df_more[name + '_slop200>slop100_%'] = cur_df_more[name + '_slop200>slop100_#'] / shape * 100
  more_frames.append(cur_df_more)

df      = pd.concat(peak_frames, axis=1)
df_more = pd.concat(more_frames, axis=1)

df.to_csv('kouzine_mirna_overlaps.tsv', sep='\t')
df_more.to_csv('kouzine_mirna_overlap_increase.tsv', sep='\t')

  0%|          | 0/4 [00:00<?, ?it/s]

2022-05-12 15:54:34.836 | INFO     | __main__:<module>:14 - z-dna: 25,059 regions


  0%|          | 0/177 [00:00<?, ?it/s]

2022-05-12 15:57:14.294 | INFO     | __main__:<module>:14 - quadruplex: 20,253 regions


  0%|          | 0/177 [00:00<?, ?it/s]

2022-05-12 15:59:15.513 | INFO     | __main__:<module>:14 - sidd: 15,296 regions


  0%|          | 0/177 [00:00<?, ?it/s]

2022-05-12 16:00:56.294 | INFO     | __main__:<module>:14 - h-dna: 17,100 regions


  0%|          | 0/177 [00:00<?, ?it/s]

In [None]:
# Lift pandas' row/column display limits so the whole table is rendered ...
pd.set_option('display.max_rows', None, 'display.max_columns', None)
display(df.fillna(0))
# ... then switch to compact limits for the cells that follow.
pd.set_option('display.max_rows', 10, 'display.max_columns', 20)

Unnamed: 0,z-dna_#,z-dna.slop100_#,z-dna.slop200_#,z-dna_%,z-dna.slop100_%,z-dna.slop200_%,quadruplex_#,quadruplex.slop100_#,quadruplex.slop200_#,quadruplex_%,quadruplex.slop100_%,quadruplex.slop200_%,sidd_#,sidd.slop100_#,sidd.slop200_#,sidd_%,sidd.slop100_%,sidd.slop200_%,h-dna_#,h-dna.slop100_#,h-dna.slop200_#,h-dna_%,h-dna.slop100_%,h-dna.slop200_%
let-7/miR-98,0,205,449,0.0,0.818069,1.791771,8,264,503,0.0395,1.303511,2.483583,57,258,403,0.372646,1.686715,2.634676,26,311,536,0.152047,1.818713,3.134503
miR-1/206,1,172,382,0.003991,0.68638,1.524402,16,253,437,0.079001,1.249198,2.157705,139,386,519,0.908734,2.523536,3.393044,0,138,328,0.0,0.807018,1.918129
miR-7,0,261,603,0.0,1.041542,2.406321,9,250,516,0.044438,1.234385,2.547771,59,273,507,0.385722,1.78478,3.314592,5,237,506,0.02924,1.385965,2.959064
miR-9,0,195,471,0.0,0.778164,1.879564,1,273,841,0.004938,1.347948,4.152471,136,436,1129,0.889121,2.850418,7.381015,2,187,532,0.011696,1.093567,3.111111
miR-10,2,247,543,0.007981,0.985674,2.166886,36,367,726,0.177751,1.812077,3.584654,53,306,515,0.346496,2.000523,3.366893,0,160,383,0.0,0.935673,2.239766
miR-22,0,636,1285,0.0,2.53801,5.127898,26,570,1078,0.128376,2.814398,5.322668,63,261,571,0.411872,1.706328,3.733002,0,201,473,0.0,1.175439,2.766082
miR-25/32/92/363/367,11,92,214,0.043896,0.367134,0.853985,0,57,148,0.0,0.28144,0.730756,49,154,236,0.320345,1.006799,1.542887,0,36,118,0.0,0.210526,0.690058
miR-29,0,494,1032,0.0,1.971348,4.118281,1,265,608,0.004938,1.308448,3.002024,78,326,577,0.509937,2.131276,3.772228,0,163,385,0.0,0.953216,2.251462
miR-31,2,283,590,0.007981,1.129335,2.354444,17,331,590,0.083938,1.634326,2.913149,63,294,459,0.411872,1.922071,3.000785,3,303,566,0.017544,1.77193,3.309942
miR-33,1,170,329,0.003991,0.678399,1.312902,2,116,264,0.009875,0.572755,1.303511,107,247,376,0.699529,1.614801,2.458159,0,91,226,0.0,0.532164,1.321637


In [None]:
# Lift pandas' row/column display limits so the whole table is rendered ...
pd.set_option('display.max_rows', None, 'display.max_columns', None)
display(df_more.fillna(0))
# ... then switch to compact limits for the cells that follow.
pd.set_option('display.max_rows', 10, 'display.max_columns', 20)

Unnamed: 0,z-dna_slop100>slop_#,z-dna_slop200>slop100_#,z-dna_slop100>slop_%,z-dna_slop200>slop100_%,quadruplex_slop100>slop_#,quadruplex_slop200>slop100_#,quadruplex_slop100>slop_%,quadruplex_slop200>slop100_%,sidd_slop100>slop_#,sidd_slop200>slop100_#,sidd_slop100>slop_%,sidd_slop200>slop100_%,h-dna_slop100>slop_#,h-dna_slop200>slop100_#,h-dna_slop100>slop_%,h-dna_slop200>slop100_%
0,205,245,0.818069,0.977693,256,256,1.26401,1.26401,201,149,1.314069,0.974111,294,257,1.719298,1.502924
1,171,212,0.68239,0.846003,237,185,1.170197,0.913445,248,138,1.621339,0.902197,138,192,0.807018,1.122807
2,261,346,1.041542,1.380741,241,268,1.189947,1.323261,215,240,1.405596,1.569038,234,278,1.368421,1.625731
3,195,279,0.778164,1.113372,272,578,1.343011,2.853898,302,712,1.974372,4.654812,186,351,1.087719,2.052632
4,245,302,0.977693,1.205156,332,363,1.639263,1.792327,253,217,1.654027,1.418672,160,226,0.935673,1.321637
5,636,674,2.53801,2.689652,545,535,2.690959,2.641584,198,324,1.294456,2.118201,201,279,1.175439,1.631579
6,81,124,0.323237,0.494832,57,92,0.28144,0.454254,105,83,0.686454,0.542626,36,82,0.210526,0.479532
7,494,562,1.971348,2.242707,264,355,1.303511,1.752827,248,261,1.621339,1.706328,163,225,0.953216,1.315789
8,281,317,1.121354,1.265015,315,272,1.555325,1.343011,233,169,1.523274,1.104864,301,283,1.760234,1.654971
9,169,165,0.674408,0.658446,114,150,0.56288,0.740631,140,130,0.915272,0.849895,91,135,0.532164,0.789474


## Count miRNA families

In [None]:
def create_dataframe(mirna_count: np.array, name: str):
  """Build a one-row summary table from a count matrix.

  The row is labelled `name`, has one column per grouping returned by
  `make_d`, and a trailing `total` column holding the sum of those groupings.
  """
  groupings = make_d(mirna_count, name)
  summary = pd.DataFrame([groupings], index=[name])
  return summary.assign(total=summary.sum(axis=1))

def make_d(mirna_count: np.ndarray, name: str) -> dict:
  """Classify regions by how many miRNA families hit them, and how often.

  Args:
    mirna_count: 2-D count matrix with one row per miRNA family and one column
      per region (any array-like accepted by `np.asarray`); entry `[i, j]` is
      the number of seed-motif occurrences of family `i` in region `j`.
    name: Label of the data set. Unused here; kept so existing callers that
      pass it keep working.

  Returns:
    A dict with the number of regions in each of four disjoint groups:
      no_mirna                   -- no motif of any family
      single_mirna               -- exactly one family, exactly one occurrence
      single_but_multiple_times  -- exactly one family, more than one occurrence
      multiple_types             -- two or more different families
  """
  counts = np.asarray(mirna_count)

  hits_per_region     = counts.sum(axis=0)        # total motif occurrences
  families_per_region = (counts > 0).sum(axis=0)  # families with >= 1 occurrence

  # The groups are keyed on the number of *families present*. Counting the
  # distinct count values per region (nunique) is not equivalent: two families
  # with one hit each yield the values {0, 1}, indistinguishable from a single
  # family, and were therefore reported as `single_but_multiple_times`.
  one_family = families_per_region == 1

  return {
    'no_mirna':                  int((hits_per_region == 0).sum()),
    'single_mirna':              int((one_family & (hits_per_region == 1)).sum()),
    'single_but_multiple_times': int((one_family & (hits_per_region > 1)).sum()),
    'multiple_types':            int((families_per_region > 1).sum()),
  }


# For every data set, build the (family x region) motif-count matrix of the
# original regions and of their `.slop100` / `.slop200` / `.slop500` variants,
# summarise each matrix with `create_dataframe`, and write the combined table.

kouzine_data = [
  ('z-dna',       './mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_z-dna.fa'),
  ('sidd',        './mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_sidd.fa'),
  ('quadruplex',  './mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_quadruplex.fa'),
  ('h-dna',       './mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_h-dna.fa'),

  ('z-dna_and_sidd_slop100',      './mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_z-dna_and_sidd_slop100.fa'),
  ('quadruplex_and_sidd_slop100', './mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_quadruplex_and_sidd_slop100.fa'),
  ('h-dna_and_sidd_slop100',      './mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_h-dna_and_sidd_slop100.fa'),

  ('z-dna_and_sidd_slop200',      './mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_z-dna_and_sidd_slop200.fa'),
  ('quadruplex_and_sidd_slop200', './mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_quadruplex_and_sidd_slop200.fa'),
  ('h-dna_and_sidd_slop200',      './mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_h-dna_and_sidd_slop200.fa'),

  ('z-dna_and_sidd_slop500',      './mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_z-dna_and_sidd_slop500.fa'),
  ('quadruplex_and_sidd_slop500', './mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_quadruplex_and_sidd_slop500.fa'),
  ('h-dna_and_sidd_slop500',      './mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_h-dna_and_sidd_slop500.fa'),
]


def _read_upper_sequences(fasta_path):
  """Return every sequence in `fasta_path` as an upper-cased plain string."""
  return [str(rec.seq).upper() for rec in SeqIO.parse(fasta_path, "fasta")]


rows, rows100, rows200, rows500 = [], [], [], []

for name, path in tqdm(kouzine_data):
  name100 = name+'.slop100'
  name200 = name+'.slop200'
  name500 = name+'.slop500'

  # Upper-case each sequence once instead of once per (record, family) pair.
  seqs    = _read_upper_sequences(path)
  seqs100 = _read_upper_sequences(path.replace(name, name100))
  seqs200 = _read_upper_sequences(path.replace(name, name200))
  seqs500 = _read_upper_sequences(path.replace(name, name500))

  # Count the parsed records rather than parsing `wc -l` output: that relied on
  # exactly two lines per record and on GNU-style (unpadded) `wc` formatting.
  shape = len(seqs)
  logger.info(f'{name}: {shape:,d} regions')

  if not (len(seqs100) == len(seqs200) == len(seqs500) == shape):
    # Records are paired by position, so unequal files give misaligned pairs.
    logger.warning(
      f'{name}: record counts differ (base={shape:,d}, slop100={len(seqs100):,d}, '
      f'slop200={len(seqs200):,d}, slop500={len(seqs500):,d}); '
      f'only the common leading records are counted'
    )

  # (family x region) matrices of seed-motif occurrence counts.
  mirna_count     = np.zeros((len(mirna_dict), shape))
  mirna_count100  = np.zeros((len(mirna_dict), shape))
  mirna_count200  = np.zeros((len(mirna_dict), shape))
  mirna_count500  = np.zeros((len(mirna_dict), shape))

  paired = zip(seqs, seqs100, seqs200, seqs500)
  for j, (seq, seq100, seq200, seq500) in enumerate(tqdm(paired, total=shape, leave=False)):
    for i, motif_list in enumerate(mirna_dict.values()):
      mirna_count[i, j]     = sum(seq.count(motif) for motif in motif_list)
      mirna_count100[i, j]  = sum(seq100.count(motif) for motif in motif_list)
      mirna_count200[i, j]  = sum(seq200.count(motif) for motif in motif_list)
      mirna_count500[i, j]  = sum(seq500.count(motif) for motif in motif_list)

  rows.append(create_dataframe(mirna_count, name))
  rows100.append(create_dataframe(mirna_count100, name))
  rows200.append(create_dataframe(mirna_count200, name))
  rows500.append(create_dataframe(mirna_count500, name))

df    = pd.concat(rows, axis=0)
df100 = pd.concat(rows100, axis=0)
df200 = pd.concat(rows200, axis=0)
df500 = pd.concat(rows500, axis=0)

# The four tables share the same column names; suffix the slop variants so the
# combined table has unique, self-describing columns (the un-slopped columns
# keep their original names).
res = pd.concat(
  [
    df,
    df100.add_suffix('_slop100'),
    df200.add_suffix('_slop200'),
    df500.add_suffix('_slop500'),
  ],
  axis=1,
)
res.to_csv('mirna_families.tsv', sep='\t')

  0%|          | 0/13 [00:00<?, ?it/s]

2022-05-12 16:18:00.311 | INFO     | __main__:<module>:47 - z-dna: 25,059 regions


  0%|          | 0/25059 [00:00<?, ?it/s]

2022-05-12 16:20:19.018 | INFO     | __main__:<module>:47 - sidd: 15,296 regions


  0%|          | 0/15296 [00:00<?, ?it/s]

2022-05-12 16:21:50.334 | INFO     | __main__:<module>:47 - quadruplex: 20,253 regions


  0%|          | 0/20253 [00:00<?, ?it/s]

2022-05-12 16:23:42.044 | INFO     | __main__:<module>:47 - h-dna: 17,100 regions


  0%|          | 0/17100 [00:00<?, ?it/s]

2022-05-12 16:25:15.382 | INFO     | __main__:<module>:47 - z-dna_and_sidd_slop100: 758 regions


  0%|          | 0/758 [00:00<?, ?it/s]

2022-05-12 16:25:19.791 | INFO     | __main__:<module>:47 - quadruplex_and_sidd_slop100: 2,250 regions


  0%|          | 0/2250 [00:00<?, ?it/s]

2022-05-12 16:25:32.580 | INFO     | __main__:<module>:47 - h-dna_and_sidd_slop100: 2,016 regions


  0%|          | 0/2016 [00:00<?, ?it/s]

2022-05-12 16:25:44.019 | INFO     | __main__:<module>:47 - z-dna_and_sidd_slop200: 1,117 regions


  0%|          | 0/1117 [00:00<?, ?it/s]

2022-05-12 16:25:50.314 | INFO     | __main__:<module>:47 - quadruplex_and_sidd_slop200: 2,534 regions


  0%|          | 0/2534 [00:00<?, ?it/s]

2022-05-12 16:26:04.616 | INFO     | __main__:<module>:47 - h-dna_and_sidd_slop200: 2,249 regions


  0%|          | 0/2249 [00:00<?, ?it/s]

2022-05-12 16:26:17.265 | INFO     | __main__:<module>:47 - z-dna_and_sidd_slop500: 1,802 regions


  0%|          | 0/1802 [00:00<?, ?it/s]

2022-05-12 16:26:27.440 | INFO     | __main__:<module>:47 - quadruplex_and_sidd_slop500: 3,089 regions


  0%|          | 0/3089 [00:00<?, ?it/s]

2022-05-12 16:26:44.785 | INFO     | __main__:<module>:47 - h-dna_and_sidd_slop500: 2,560 regions


  0%|          | 0/2560 [00:00<?, ?it/s]

In [None]:
# Lift pandas' row/column display limits so the whole table is rendered ...
pd.set_option('display.max_rows', None, 'display.max_columns', None)
display(res.fillna(0))
# ... then switch to compact limits for the cells that follow.
pd.set_option('display.max_rows', 10, 'display.max_columns', 20)

Unnamed: 0,no_mirna,single_mirna,single_but_multiple_times,multiple_types,total,no_mirna.1,single_mirna.1,single_but_multiple_times.1,multiple_types.1,total.1,no_mirna.2,single_mirna.2,single_but_multiple_times.2,multiple_types.2,total.2,no_mirna.3,single_mirna.3,single_but_multiple_times.3,multiple_types.3,total.3
z-dna,21668,2616,730,45,25059,2262,3936,14795,4066,25059,444,869,15131,8615,25059,55,26,6289,18689,25059
sidd,3153,4117,6273,1753,15296,135,520,10354,4287,15296,10,35,8762,6489,15296,1,0,2977,12318,15296
quadruplex,15684,3736,770,63,20253,1031,2372,12984,3866,20253,161,359,12269,7464,20253,11,14,4676,15552,20253
h-dna,15626,1066,395,13,17100,2687,3117,8549,2747,17100,670,923,9800,5707,17100,43,44,4715,12298,17100
z-dna_and_sidd_slop100,620,95,41,2,758,58,116,441,143,758,3,20,472,263,758,0,0,209,549,758
quadruplex_and_sidd_slop100,1765,424,59,2,2250,54,197,1439,560,2250,4,11,1311,924,2250,0,0,508,1742,2250
h-dna_and_sidd_slop100,1945,65,5,1,2016,173,372,1103,368,2016,14,41,1242,719,2016,0,0,494,1522,2016
z-dna_and_sidd_slop200,923,140,52,2,1117,97,171,646,203,1117,10,33,692,382,1117,0,0,303,814,1117
quadruplex_and_sidd_slop200,1995,471,65,3,2534,71,238,1617,608,2534,4,22,1488,1020,2534,0,0,581,1953,2534
h-dna_and_sidd_slop200,2158,79,10,2,2249,217,426,1200,406,2249,20,62,1381,786,2249,0,0,569,1680,2249


## Flipons that overlap miRNA

In [None]:
# For every structure type, keep the BED rows whose FASTA sequence contains at
# least one miRNA seed motif and write them to a `*_miRNA.bed` file next to the
# input BED file.

kouzine_data = [
  ('z-dna',       './mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_z-dna.fa'),
  ('quadruplex',  './mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_quadruplex.fa'),
  ('sidd',        './mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_sidd.fa'),
  ('h-dna',       './mm10_kouzine_ssDNA_fa/mm10_kouzine_actb_ssdna_enriched_h-dna.fa'),
]

# Only "any motif of any family" matters here, so flatten the table once.
all_motifs = [motif for motif_list in mirna_dict.values() for motif in motif_list]

for name, path in tqdm(kouzine_data):
  # Upper-case each sequence once instead of once per miRNA family.
  seqs = [str(rec.seq).upper() for rec in SeqIO.parse(path, "fasta")]

  # Count the parsed records rather than parsing `wc -l` output: that relied on
  # exactly two lines per record and on GNU-style (unpadded) `wc` formatting.
  shape = len(seqs)
  logger.info(f'{name}: {shape:,d} regions')

  # `any` stops at the first matching motif instead of scanning every family.
  flags = np.array(
    [any(motif in seq for motif in all_motifs) for seq in tqdm(seqs, leave=False)],
    dtype=bool,
  )

  logger.debug(flags.sum())

  # './<dir>_fa/<file>.fa' -> './<dir>/<file>.bed'
  path_bed = path.replace('.fa', '.bed',).replace('_fa', '')
  t = pd.read_csv(path_bed, sep='\t', header=None, index_col=None)

  # Row i of the BED file is taken to describe record i of the FASTA file;
  # the boolean mask must therefore have exactly one entry per BED row.
  path_result = path_bed.replace('.bed', '_miRNA.bed')
  t[flags].to_csv(path_result, sep='\t', header=False, index=False)


  0%|          | 0/4 [00:00<?, ?it/s]

2022-05-12 16:27:27.634 | INFO     | __main__:<module>:11 - z-dna: 25,059 regions


  0%|          | 0/25059 [00:00<?, ?it/s]

2022-05-12 16:27:47.692 | DEBUG    | __main__:<module>:21 - 3391
2022-05-12 16:27:47.750 | INFO     | __main__:<module>:11 - quadruplex: 20,253 regions


  0%|          | 0/20253 [00:00<?, ?it/s]

2022-05-12 16:28:01.893 | DEBUG    | __main__:<module>:21 - 4569
2022-05-12 16:28:01.947 | INFO     | __main__:<module>:11 - sidd: 15,296 regions


  0%|          | 0/15296 [00:00<?, ?it/s]

2022-05-12 16:28:14.456 | DEBUG    | __main__:<module>:21 - 12143
2022-05-12 16:28:14.517 | INFO     | __main__:<module>:11 - h-dna: 17,100 regions


  0%|          | 0/17100 [00:00<?, ?it/s]

2022-05-12 16:28:26.520 | DEBUG    | __main__:<module>:21 - 1474
