# Dinucleotide frequencies in reference genomes

In [57]:
pip install biopython -q

Note: you may need to restart the kernel to use updated packages.


In [1]:
import gzip
import json
import requests

from Bio import SeqIO
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import pandas as pd
import tarfile
from tqdm import tqdm

%reload_ext autoreload
%autoreload 2
import dinucleo_freq as dnf

## Escherichia coli

In [3]:
# Download the Escherichia coli W (GCA_000184185) top-level DNA FASTA from
# Ensembl Genomes release 52 into the working directory, then analyse it.
url = "http://ftp.ensemblgenomes.org/pub/bacteria/release-52/fasta/bacteria_26_collection/escherichia_coli_w_gca_000184185/dna/Escherichia_coli_w_gca_000184185.ASM18418v1.dna.toplevel.fa.gz"
e_coli_fasta = url.split("/")[-1]  # local file name = last component of the URL path

# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of saving an error page as a ".fa.gz" file.
response.raise_for_status()
with open(e_coli_fasta, 'wb') as f:
    f.write(response.content)

# The call returns two values (presumably observed and theoretical dinucleotide
# frequencies -- confirm in dinucleo_freq).
e_coli_di_fr, e_coli_di_fr_th = dnf.obs_vs_theo_dinuc(e_coli_fasta)

😼Analysing Escherichia coli genome
🙀Total number of records: 3
😸Analysing record 1: pRK1, sequence length: 102536, number of records rest: 2
😸Analysing record 2: pRK2, sequence length: 5360, number of records rest: 1
😸Analysing record 3: Chromosome, sequence length: 4900968, number of records rest: 0
😻Completed


## Drosophila melanogaster

In [None]:
# Analyse the local D. melanogaster FASTA; the two returned values are unpacked
# below (presumably observed and theoretical frequencies -- confirm in dinucleo_freq).
(
    d_melanogaster_di_fr,
    d_melanogaster_di_fr_th,
) = dnf.obs_vs_theo_dinuc(
    "Drosophila_melanogaster.BDGP6.32.dna.toplevel.fa.gz"
)

## Platynereis dumerilii

In [None]:
# Analyse the local P. dumerilii FASTA. The second target must be the "_th" name:
# unpacking both results into p_dumerilii_di_fr would discard the first value and
# leave p_dumerilii_di_fr_th undefined for the saving cell.
# NOTE(review): the ".fna.dz" extension looks like a typo for ".fna.gz" -- confirm
# against the file on disk before changing the path.
p_dumerilii_di_fr, p_dumerilii_di_fr_th = dnf.obs_vs_theo_dinuc(
    "GCA_026936325.1_EMBL_pdum_1.0_genomic.fna.dz"
)

## Arabidopsis thaliana

In [None]:
# Analyse the local A. thaliana FASTA; the two returned values are unpacked
# below (presumably observed and theoretical frequencies -- confirm in dinucleo_freq).
(
    a_thaliana_di_fr,
    a_thaliana_di_fr_th,
) = dnf.obs_vs_theo_dinuc(
    "Arabidopsis_thaliana.TAIR10.dna.toplevel.fa.gz"
)

## Saccharomyces cerevisiae

In [None]:
# Analyse the local S. cerevisiae FASTA; the two returned values are unpacked
# below (presumably observed and theoretical frequencies -- confirm in dinucleo_freq).
(
    s_cerevisiae_di_fr,
    s_cerevisiae_di_fr_th,
) = dnf.obs_vs_theo_dinuc(
    "Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa.gz"
)

## Capitella teleta

In [None]:
# Analyse the local C. teleta FASTA; the two returned values are unpacked
# below (presumably observed and theoretical frequencies -- confirm in dinucleo_freq).
(
    c_teleta_di_fr,
    c_teleta_di_fr_th,
) = dnf.obs_vs_theo_dinuc(
    'Capitella_teleta.Capitella_teleta_v1.0.dna.toplevel.fa.gz'
)

## Caenorhabditis elegans

In [None]:
# Analyse the local C. elegans FASTA; the two returned values are unpacked
# below (presumably observed and theoretical frequencies -- confirm in dinucleo_freq).
(
    c_elegans_di_fr,
    c_elegans_di_fr_th,
) = dnf.obs_vs_theo_dinuc(
    "Caenorhabditis_elegans.WBcel235.dna.toplevel.fa.gz"
)

## Bacillus subtilis

In [None]:
# Analyse the ".gz" member(s) of the Bacillus subtilis tar archive.
# If the archive holds several ".gz" members, the result of the last one is kept.
with tarfile.open("Bacillus_subtilis.tar", "r") as tar:
    gz_members = [entry for entry in tar.getmembers() if entry.name.endswith('.gz')]
    if not gz_members:
        # Without this check the cell would finish silently and leave
        # b_subtilis_di_fr / b_subtilis_di_fr_th undefined (or stale from an earlier run).
        raise FileNotFoundError("No '.gz' member found in Bacillus_subtilis.tar")
    for member in gz_members:
        # The member is handed over as an open file object rather than a path;
        # it must be consumed while the archive is still open.
        handle = tar.extractfile(member)
        b_subtilis_di_fr, b_subtilis_di_fr_th = dnf.obs_vs_theo_dinuc(handle)

## Halobacterium salinarum

In [127]:
# Analyse the ".gz" member(s) of the Halobacterium salinarum tar archive.
# If the archive holds several ".gz" members, the result of the last one is kept.
# NOTE(review): the recorded output of this cell lists no per-record lines --
# confirm that obs_vs_theo_dinuc reads a tar member handle correctly.
with tarfile.open("Halobacterium_salinarum.tar", "r") as tar:
    gz_members = [entry for entry in tar.getmembers() if entry.name.endswith('.gz')]
    if not gz_members:
        # Without this check the cell would finish silently and leave
        # h_salinarum_di_fr / h_salinarum_fr_th undefined (or stale from an earlier run).
        raise FileNotFoundError("No '.gz' member found in Halobacterium_salinarum.tar")
    for member in gz_members:
        # The member is handed over as an open file object rather than a path;
        # it must be consumed while the archive is still open.
        handle = tar.extractfile(member)
        # The second name has no "_di" infix, unlike the other organisms; it is kept
        # as-is because the saving cell refers to h_salinarum_fr_th.
        h_salinarum_di_fr, h_salinarum_fr_th = dnf.obs_vs_theo_dinuc(handle)

😼Analysing Halobacterium salinarum genome
😻Csompleted


## Gallus gallus

In [None]:
# Analyse the local G. gallus FASTA; the two returned values are unpacked
# below (presumably observed and theoretical frequencies -- confirm in dinucleo_freq).
(
    g_gallus_di_fr,
    g_gallus_di_fr_th,
) = dnf.obs_vs_theo_dinuc(
    "Gallus_gallus.bGalGal1.mat.broiler.GRCg7b.dna.toplevel.fa.gz"
)

## Mus musculus

In [None]:
# Analyse the local M. musculus FASTA; the two returned values are unpacked
# below (presumably observed and theoretical frequencies -- confirm in dinucleo_freq).
(
    m_musculus_di_fr,
    m_musculus_di_fr_th,
) = dnf.obs_vs_theo_dinuc(
    "Mus_musculus.GRCm39.dna.toplevel.fa.gz"
)

## Danio rerio

In [None]:
# Analyse the local D. rerio FASTA; the two returned values are unpacked
# below (presumably observed and theoretical frequencies -- confirm in dinucleo_freq).
(
    d_rerio_di_fr,
    d_rerio_di_fr_th,
) = dnf.obs_vs_theo_dinuc(
    "Danio_rerio.GRCz11.dna.toplevel.fa.gz"
)

## Homo sapiens

In [2]:
# Analyse the local H. sapiens FASTA; the two returned values are unpacked
# below (presumably observed and theoretical frequencies -- confirm in dinucleo_freq).
(
    h_sapiens_di_fr,
    h_sapiens_di_fr_th,
) = dnf.obs_vs_theo_dinuc(
    "Homo_sapiens.GRCh38.dna.toplevel.fa.gz"
)

## Saving data

In [None]:
# One row per organism: (label, first result, second result) of obs_vs_theo_dinuc.
# Keeping each label next to its data guarantees that index i in organisms.json
# names the entries at index i in observed.json and theo.json. Previously the
# labels for indices 7 and 8 (D.rerio / P.dumerilii) were swapped relative to the data.
# NOTE(review): c_teleta_* and h_sapiens_* are computed in earlier cells but are
# not saved here -- confirm whether that omission is intentional.
genome_results = [
    ('G.gallus', g_gallus_di_fr, g_gallus_di_fr_th),
    ('E.coli', e_coli_di_fr, e_coli_di_fr_th),
    ('D.melanogaster', d_melanogaster_di_fr, d_melanogaster_di_fr_th),
    ('A.thaliana', a_thaliana_di_fr, a_thaliana_di_fr_th),
    ('S.cerevisiae', s_cerevisiae_di_fr, s_cerevisiae_di_fr_th),
    ('C.elegans', c_elegans_di_fr, c_elegans_di_fr_th),
    ('M.musculus', m_musculus_di_fr, m_musculus_di_fr_th),
    ('D.rerio', d_rerio_di_fr, d_rerio_di_fr_th),
    ('P.dumerilii', p_dumerilii_di_fr, p_dumerilii_di_fr_th),
    ('B.subtilis', b_subtilis_di_fr, b_subtilis_di_fr_th),
    ('H.salinarum', h_salinarum_di_fr, h_salinarum_fr_th),
]

observed = [obs for _, obs, _ in genome_results]
theo = [th for _, _, th in genome_results]
# Position -> organism label; json.dump writes these integer keys as strings.
organisms = {position: label for position, (label, _, _) in enumerate(genome_results)}

with open("observed.json", "w") as f:
    json.dump(observed, f)
with open("organisms.json", "w") as f:
    json.dump(organisms, f)
with open("theo.json", "w") as f:
    json.dump(theo, f)