# Dinucleotide frequencies in reference genomes

In [57]:
pip install biopython -q

Note: you may need to restart the kernel to use updated packages.


In [56]:
pip install ensembl_rest -q

Note: you may need to restart the kernel to use updated packages.


In [1]:
from Bio import Entrez, SeqIO
import gzip
import requests
from tqdm import tqdm
import json
import tarfile

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import pandas as pd


%reload_ext autoreload
%autoreload 2
import dinucleo_freq as dnf

## Escherichia coli

In [3]:
# Provenance of the input file (download step is not executed here):
# http://ftp.ensemblgenomes.org/pub/bacteria/release-52/fasta/bacteria_26_collection/escherichia_coli_w_gca_000184185/dna/Escherichia_coli_w_gca_000184185.ASM18418v1.dna.toplevel.fa.gz
e_coli_genome = "Escherichia_coli_w_gca_000184185.ASM18418v1.dna.toplevel.fa.gz"

# First value is kept as the observed table, second as the theoretical one
# (presumably dinucleotide -> frequency dicts; confirm in dinucleo_freq).
e_coli_di_fr, e_coli_di_fr_th = dnf.obs_vs_theo_dinuc(e_coli_genome)

Analysing Escherichia_coli_w_gca_000184185 genome 

pRK1


100%|██████████████████████████████████████████████████████████████████████| 102535/102535 [00:00<00:00, 103294.09it/s]


pRK2


100%|███████████████████████████████████████████████████████████████████████████| 5359/5359 [00:00<00:00, 92198.95it/s]


Chromosome


100%|█████████████████████████████████████████████████████████████████████| 4900967/4900967 [00:55<00:00, 88016.83it/s]


## Drosophila melanogaster

In [None]:
# Genome FASTA (gzip) passed by file name, relative to the working directory.
d_melanogaster_genome = "Drosophila_melanogaster.BDGP6.32.dna.toplevel.fa.gz"
d_melanogaster_di_fr, d_melanogaster_di_fr_th = dnf.obs_vs_theo_dinuc(d_melanogaster_genome)

## Platynereis dumerilii

In [None]:
# The two return values are bound to distinct names: the first is kept as the
# observed table and the second as `p_dumerilii_di_fr_th`, which the
# "Saving data" cell reads when it builds its `theo` list.
# NOTE(review): the ".dz" extension differs from the ".gz" used for every other
# genome in this notebook; confirm the file name on disk.
p_dumerilii_di_fr, p_dumerilii_di_fr_th = dnf.obs_vs_theo_dinuc("GCA_026936325.1_EMBL_pdum_1.0_genomic.fna.dz")

## Arabidopsis thaliana

In [None]:
# Genome FASTA (gzip) passed by file name, relative to the working directory.
a_thaliana_genome = "Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz"
a_thaliana_di_fr, a_thaliana_di_fr_th = dnf.obs_vs_theo_dinuc(a_thaliana_genome)

## Saccharomyces cerevisiae

In [None]:
# Genome FASTA (gzip) passed by file name, relative to the working directory.
s_cerevisiae_genome = "Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz"
s_cerevisiae_di_fr, s_cerevisiae_di_fr_th = dnf.obs_vs_theo_dinuc(s_cerevisiae_genome)

## Caenorhabditis elegans

In [None]:
# Genome FASTA (gzip) passed by file name, relative to the working directory.
c_elegans_genome = "Caenorhabditis_elegans.WBcel235.dna.toplevel.fa.gz"
c_elegans_di_fr, c_elegans_di_fr_th = dnf.obs_vs_theo_dinuc(c_elegans_genome)

## Bacillus subtilis

In [21]:
# Walk the tar archive and analyse every member whose name ends in ".gz".
# Here the helper receives an open file object rather than a file name.
with tarfile.open("Bacillus_subtilis.tar", "r") as tar:
    for member in tar.getmembers():
        # Skip everything that is not a gzip-compressed entry.
        if not member.name.endswith('.gz'):
            continue
        handle = tar.extractfile(member)
        # The result names are rebound on each pass, so if the archive holds
        # several ".gz" members only the last one's tables are kept.
        b_subtilis_di_fr, b_subtilis_di_fr_th = dnf.obs_vs_theo_dinuc(handle)

NC_000964.3


100%|█████████████████████████████████████████████████████████████████████| 4215605/4215605 [01:08<00:00, 61571.49it/s]


## Halobacterium salinarum

In [22]:
# Walk the tar archive and analyse every member whose name ends in ".gz".
# Here the helper receives an open file object rather than a file name.
with tarfile.open("Halobacterium_salinarum.tar", "r") as tar:
    for member in tar.getmembers():
        # Skip everything that is not a gzip-compressed entry.
        if not member.name.endswith('.gz'):
            continue
        handle = tar.extractfile(member)
        # The result names are rebound on each pass, so if the archive holds
        # several ".gz" members only the last one's tables are kept.
        # NOTE(review): the second name is `h_salinarum_fr_th` (no `_di_`),
        # unlike the `*_di_fr_th` pattern of the other cells; the next cell
        # reads it under this exact name.
        h_salinarum_di_fr, h_salinarum_fr_th = dnf.obs_vs_theo_dinuc(handle)

NC_002607.1


100%|█████████████████████████████████████████████████████████████████████| 2014238/2014238 [00:51<00:00, 39462.68it/s]


NC_001869.1


100%|███████████████████████████████████████████████████████████████████████| 191345/191345 [00:06<00:00, 30199.39it/s]


NC_002608.1


100%|███████████████████████████████████████████████████████████████████████| 365424/365424 [00:09<00:00, 38110.38it/s]


In [26]:
# Display the two H. salinarum result tables as a tuple (first, second return value).
h_salinarum_di_fr, h_salinarum_fr_th

({'TT': 0.0274,
  'GT': 0.0682,
  'CT': 0.0458,
  'AT': 0.0289,
  'TG': 0.0514,
  'GG': 0.0851,
  'CG': 0.1474,
  'AG': 0.0458,
  'TC': 0.0747,
  'GC': 0.1017,
  'CC': 0.0851,
  'AC': 0.0682,
  'TA': 0.0168,
  'GA': 0.0747,
  'CA': 0.0514,
  'AA': 0.0274},
 {'TT': 0.029,
  'GT': 0.0561,
  'CT': 0.0561,
  'AT': 0.029,
  'TG': 0.0561,
  'GG': 0.1087,
  'CG': 0.1087,
  'AG': 0.0561,
  'TC': 0.0561,
  'GC': 0.1087,
  'CC': 0.1087,
  'AC': 0.0561,
  'TA': 0.029,
  'GA': 0.0561,
  'CA': 0.0561,
  'AA': 0.029})

## Gallus gallus

In [None]:
# Genome FASTA (gzip) passed by file name, relative to the working directory.
g_gallus_genome = "Gallus_gallus.bGalGal1.mat.broiler.GRCg7b.dna.toplevel.fa.gz"
g_gallus_di_fr, g_gallus_di_fr_th = dnf.obs_vs_theo_dinuc(g_gallus_genome)

## Mus musculus

In [None]:
# Genome FASTA (gzip) passed by file name, relative to the working directory.
m_musculus_genome = "Mus_musculus.GRCm39.dna.toplevel.fa.gz"
m_musculus_di_fr, m_musculus_di_fr_th = dnf.obs_vs_theo_dinuc(m_musculus_genome)

## Danio rerio

In [None]:
# Genome FASTA (gzip) passed by file name, relative to the working directory.
d_rerio_genome = "Danio_rerio.GRCz11.dna.toplevel.fa.gz"
d_rerio_di_fr, d_rerio_di_fr_th = dnf.obs_vs_theo_dinuc(d_rerio_genome)

## Homo sapiens

In [None]:
# Genome FASTA (gzip) passed by file name, relative to the working directory.
# Long-running: the saved progress bars show roughly 30-40 minutes per sequence.
h_sapiens_genome = "Homo_sapiens.GRCh38.dna.toplevel.fa.gz"
h_sapiens_di_fr, h_sapiens_di_fr_th = dnf.obs_vs_theo_dinuc(h_sapiens_genome)

Analysing Homo_sapiens genome 



100%|████████████████████████████████████████████████████████████████| 248956421/248956421 [40:28<00:00, 102530.12it/s]
100%|█████████████████████████████████████████████████████████████████| 242193528/242193528 [40:56<00:00, 98595.47it/s]
100%|█████████████████████████████████████████████████████████████████| 198295558/198295558 [34:42<00:00, 95228.30it/s]
100%|█████████████████████████████████████████████████████████████████| 190214554/190214554 [32:38<00:00, 97115.90it/s]
100%|█████████████████████████████████████████████████████████████████| 181538258/181538258 [31:58<00:00, 94609.41it/s]
100%|█████████████████████████████████████████████████████████████████| 170805978/170805978 [31:59<00:00, 88966.96it/s]
100%|█████████████████████████████████████████████████████████████████| 159345972/159345972 [32:16<00:00, 82294.22it/s]
100%|█████████████████████████████████████████████████████████████████| 145138635/145138635 [33:17<00:00, 72678.14it/s]
100%|███████████████████████████████████

In [None]:
# Display the two H. sapiens result tables as a tuple (first, second return value).
h_sapiens_di_fr, h_sapiens_di_fr_th

## Saving data

In [21]:
# Gather the per-organism result tables. The three collections are
# index-aligned: position i in `observed` and `theo` belongs to the organism
# stored under key i in `organisms`.
observed = [g_gallus_di_fr, e_coli_di_fr, d_melanogaster_di_fr, a_thaliana_di_fr, s_cerevisiae_di_fr, 
            c_elegans_di_fr, m_musculus_di_fr, d_rerio_di_fr, p_dumerilii_di_fr]
theo = [g_gallus_di_fr_th, e_coli_di_fr_th, d_melanogaster_di_fr_th, a_thaliana_di_fr_th, s_cerevisiae_di_fr_th, 
        c_elegans_di_fr_th, m_musculus_di_fr_th, d_rerio_di_fr_th, p_dumerilii_di_fr_th]
organisms = {0:'G.gallus', 1:'E.coli', 2:'D.melanogaster', 3:'A.thaliana', 4:'S.cerevisiae', 
             5:'C.elegans', 6: 'M.musculus', 7: 'D.rerio', 8: 'P.dumerilii'}

# Fail early if an organism is added to one collection but not the others.
assert len(observed) == len(theo) == len(organisms), "observed/theo/organisms are out of step"

# Key-sort each organism's table and keep ONE entry per organism, in the same
# order as the input lists. Rebinding a single name inside a loop would leave
# only the last organism's table to be written out.
observed1 = [{k: dic[k] for k in sorted(dic)} for dic in observed]
theo1 = [{k: dic[k] for k in sorted(dic)} for dic in theo]

with open("observed.json", "w") as f:
    json.dump(observed1, f)

# JSON object keys are always strings, so the integer keys are written as "0", "1", ...
with open("organisms.json", "w") as f:
    json.dump(organisms, f)

with open("theo.json", "w") as f:
    json.dump(theo1, f)

Unnamed: 0,AA,AC,AG,AT,CA,CC,CG,CT,GA,GC,GG,GT,TA,TC,TG,TT
E.coli,0.073,0.0552,0.0511,0.0668,0.0698,0.0584,0.0747,0.0511,0.0576,0.0828,0.0584,0.0552,0.0457,0.0576,0.0698,0.073
D.melanogaster,0.1017,0.0526,0.0544,0.0813,0.0679,0.0463,0.0413,0.0544,0.0557,0.0554,0.0463,0.0526,0.0646,0.0557,0.0679,0.1017
A.thaliana,0.1156,0.0523,0.0595,0.0924,0.0635,0.0339,0.0235,0.0595,0.0641,0.03,0.0339,0.0523,0.0766,0.0641,0.0635,0.1156
S.cerevisiae,0.1084,0.0525,0.0582,0.0902,0.0645,0.0388,0.0292,0.0582,0.0621,0.0373,0.0388,0.0525,0.0743,0.0621,0.0645,0.1084
C.elegans,0.1354,0.0483,0.0506,0.0885,0.0619,0.0335,0.0312,0.0506,0.0621,0.0333,0.0335,0.0483,0.0634,0.0621,0.0619,0.1354
B.subtilis,0.0987,0.0462,0.0561,0.0814,0.0666,0.0458,0.049,0.0561,0.0654,0.0603,0.0458,0.0462,0.0517,0.0654,0.0666,0.0987
H.salinarum,0.0274,0.0682,0.0458,0.0289,0.0514,0.0851,0.1474,0.0458,0.0747,0.1017,0.0851,0.0682,0.0168,0.0747,0.0514,0.0274
G.gallus,0.094,0.0519,0.0723,0.0707,0.0767,0.05,0.012,0.0723,0.0595,0.0496,0.05,0.0519,0.0589,0.0595,0.0767,0.094
M.musculus,0.0913,0.0533,0.0734,0.0738,0.0745,0.0522,0.0083,0.0734,0.0622,0.0406,0.0522,0.0533,0.0638,0.0622,0.0745,0.0913
D.rerio,0.1106,0.0565,0.0572,0.0926,0.0732,0.0348,0.0179,0.0572,0.0525,0.0392,0.0348,0.0565,0.0806,0.0525,0.0732,0.1106
