In [6]:
import os
import gzip
import pickle
import requests
import subprocess
import statistics
from collections import defaultdict

import pandas as pd
from Bio import SeqIO

In [8]:
# Make sure the download directory exists. `exist_ok=True` makes this cell
# idempotent (safe under Restart & Run All) without a separate existence check.
os.makedirs('./genomes', exist_ok=True)

def download_file(url, outfile, chunk_size=1 << 20, timeout=60):
    """
    Download `url` and write the response body to `outfile`.

    The body is streamed to disk in `chunk_size`-byte pieces so that large
    files (e.g. whole genomes) are never held in memory at once.

    Parameters
    ----------
    url : str
        Address to fetch; redirects are followed.
    outfile : str
        Path of the file to (over)write in binary mode.
    chunk_size : int, optional
        Number of bytes requested per chunk while streaming.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Without this check an
        HTML error page would be silently saved as if it were the data.
    """
    with requests.get(url, allow_redirects=True, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        # The `with` block guarantees the file is flushed and closed.
        with open(outfile, 'wb') as out_handle:
            for chunk in r.iter_content(chunk_size=chunk_size):
                out_handle.write(chunk)

def unzip_db(filename):
    """
    Extract the zip archive `filename` into the `taxonomy` directory.

    Uses the standard-library `zipfile` module instead of shelling out to the
    external `unzip` program: it needs no system binary, overwrites existing
    files without prompting (so the cell can be re-run), and raises on a
    corrupt archive instead of ignoring a non-zero exit code.

    Raises
    ------
    zipfile.BadZipFile
        If `filename` is not a valid zip archive.
    """
    import zipfile  # local import: keeps this function self-contained

    with zipfile.ZipFile(filename) as archive:
        archive.extractall('taxonomy')

# Remote sources: the RefSeq assembly summary table and the NCBI taxonomy dump.
refseq_handle = "https://ftp.ncbi.nih.gov/genomes/refseq/assembly_summary_refseq.txt"
taxdmp = 'https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip'

# Local copy of the assembly summary written by download_file below.
refseq_summary_path = 'assembly_summary_refseq.txt'

download_file(refseq_handle, refseq_summary_path)
download_file(taxdmp, 'taxdmp.zip')
unzip_db('taxdmp.zip')

# Read the copy that was just saved rather than fetching the URL a second
# time: this avoids a redundant download and guarantees the DataFrame matches
# the file on disk. The first line is skipped so that the second line is used
# as the header row. low_memory=False makes pandas infer each column's dtype
# from the whole file instead of chunk by chunk (presumably the source of the
# warning this cell used to print -- TODO confirm).
df = pd.read_csv(refseq_summary_path, skiprows=1, sep="\t", low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
# Parse the nodes.dmp file from the NCBI taxonomy dump into a child -> parent
# lookup. Fields are separated by "\t|\t"; the first field is the node's tax id
# and the second is its parent's tax id. Both are kept as strings.
node_dict = {}
with open("./taxonomy/nodes.dmp") as nodes_file:  # closed automatically
    for row in nodes_file:
        fields = row.strip().split('\t|\t')
        if len(fields) < 2:
            # Blank or malformed line: nothing to record.
            continue
        child, parent = fields[0], fields[1]
        node_dict[child] = parent
    

def assign_genomes(taxid, node_dict=node_dict):
    """
    Given a NCBI taxonomy ID, figure out the domain that ID belongs to.

    The lineage is followed upward through `node_dict` (child tax id -> parent
    tax id, both stored as strings) until one of the four top-level taxa is
    reached.

    Parameters
    ----------
    taxid : int or int-like
        NCBI taxonomy ID to classify.
    node_dict : dict, optional
        Mapping of child tax id (str) to parent tax id (str).

    Returns
    -------
    str
        'eukaryote', 'bacteria', 'archae' or 'virus'; 'not classified' when
        the lineage never reaches one of those taxa, the ID is unknown, or the
        ID cannot be interpreted as an integer.
    """
    domains = {
        2759: 'eukaryote',
        2: 'bacteria',
        2157: 'archae',
        10239: 'virus',
    }
    classification = "not classified"

    try:
        current = int(taxid)
    except (TypeError, ValueError, OverflowError):
        # None, NaN, inf or non-numeric text cannot be a tax id.
        return classification

    # Walk up iteratively. A node can be its own parent (then the walk would
    # never end), so remember visited ids and stop as soon as one repeats
    # instead of relying on hitting the interpreter's recursion limit.
    visited = set()
    while current not in domains:
        if current in visited:
            return classification
        visited.add(current)

        parent = node_dict.get(str(current))
        if parent is None:
            # Unknown tax id: no lineage to follow.
            return classification
        try:
            current = int(parent)
        except (TypeError, ValueError):
            return classification

    return domains[current]

In [12]:
# Label every assembly with the domain its taxonomy ID resolves to.
df['Classification'] = df['taxid'].map(assign_genomes)

In [5]:
# Pick a fixed number of random genomes per taxon and download them.
# The random_state values make the samples reproducible.
sampled_euks = df[df['Classification'] == 'eukaryote'].sample(20, random_state=234)
sampled_bact = df[df['Classification'] == 'bacteria'].sample(100, random_state=345)

# Keep only the part of the first column before the first '.', so the values
# can later be matched against IDs derived from the downloaded file names.
sampled_euks[sampled_euks.columns[0]] = sampled_euks[sampled_euks.columns[0]].str.split('.').str[0]
sampled_bact[sampled_bact.columns[0]] = sampled_bact[sampled_bact.columns[0]].str.split('.').str[0]

sampled_euks_list = sampled_euks[sampled_euks.columns[0]].to_list()
sampled_bact_list = sampled_bact[sampled_bact.columns[0]].to_list()

# One loop for both samples instead of two copy-pasted ones.
for sampled in (sampled_euks, sampled_bact):
    for ftp in sampled['ftp_path'].to_list():
        if not isinstance(ftp, str) or not ftp.startswith(('http://', 'https://')):
            # Missing or placeholder path: there is nothing to download.
            print("skipping assembly without a usable ftp_path:", ftp)
            continue
        ftp = ftp.rstrip("/")
        genome_id = ftp.split("/")[-1]
        # URLs always use '/', so build them by hand: os.path.join would use
        # the platform's path separator.
        gen_url = (ftp + "/" + genome_id + "_genomic.fna.gz").replace(" ", "_")
        # Reuse the shared helper so the output file is always closed.
        download_file(gen_url, os.path.join("genomes", genome_id + ".fna.gzip"))

In [None]:
def is_valid_sequence(seq):
    '''
    Return True when every symbol of `seq` is one of the upper-case bases
    A, T, C or G. Used to discard k-mers containing N or any other character.
    An empty sequence counts as valid.
    '''
    allowed_bases = {"A", "T", "C", "G"}
    unexpected = set(seq).difference(allowed_bases)
    return not unexpected

# Tunables for the k-mer counting pass.
MAX_BP_PER_GENOME = 10000000  # stop reading a genome once this many bp are tallied
CHUNK_SIZE = 5000             # sequences are cut into chunks of this many bp
KMER_SIZE = 5                 # length of the counted k-mers

kmers_visited = set()
# defaultdict(int) behaves exactly like defaultdict(lambda: 0) but, unlike a
# lambda-backed defaultdict, it can be pickled.
kmer_dict = defaultdict(int)

directory = "./genomes"
euk_count = 0
bact_count = 0

# Sets give constant-time membership tests for the sampled genome IDs.
euk_ids = set(sampled_euks_list)
bact_ids = set(sampled_bact_list)

# os.listdir returns entries in arbitrary order; sorting makes the contig
# numbering reproducible between runs.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".fna.gzip"):
        continue

    # File names are built from "_"-separated pieces; the first two pieces,
    # cut at the first '.', give the ID that is matched against the samples.
    genome_id = "_".join(filename.split("_")[0:2]).split('.')[0]

    if genome_id in euk_ids:
        genome_type = 'euk'
    elif genome_id in bact_ids:
        genome_type = 'bact'
    else:
        # Not part of the current sample (e.g. left over from an earlier run).
        # Skip it: there is no label to file its k-mers under, and counting it
        # anyway would reuse the previous chunk's name.
        continue

    genome_bp_count = 0

    with gzip.open(os.path.join(directory, filename), "rt") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            if genome_bp_count >= MAX_BP_PER_GENOME:
                # Budget used up: stop parsing the rest of this file.
                break
            if len(record.seq) <= CHUNK_SIZE:
                # Only records longer than one chunk are used.
                continue

            for start in range(0, len(record.seq), CHUNK_SIZE):
                # Every chunk is tallied as a full CHUNK_SIZE, including a
                # shorter final chunk.
                genome_bp_count += CHUNK_SIZE
                if genome_bp_count >= MAX_BP_PER_GENOME:
                    break

                # Number only the chunks that are actually counted, so the
                # contig names have no gaps.
                if genome_type == 'euk':
                    euk_count += 1
                    contig_5kb_name = "euk_" + str(euk_count)
                else:
                    bact_count += 1
                    contig_5kb_name = "bact_" + str(bact_count)

                # Convert to an upper-case str once per chunk rather than once
                # per k-mer.
                contig_5kb = str(record.seq[start:start + CHUNK_SIZE]).upper()
                for j in range(len(contig_5kb) - KMER_SIZE + 1):
                    kmer = contig_5kb[j:j + KMER_SIZE]
                    if is_valid_sequence(kmer):
                        kmers_visited.add(kmer)
                        kmer_dict[(contig_5kb_name, kmer)] += 1


GCF_000002995 euk
GCF_000149685 euk
GCF_000185845 bact
GCF_000192795 euk
GCF_000214015 euk
GCF_000223845 euk
GCF_000231095 euk
GCF_000292625 euk
GCF_000370305 bact
GCF_000464535 euk
GCF_000475195 euk
GCF_000503715 bact
GCF_000585565 euk
GCF_000621785 bact
GCF_000653655 bact
GCF_000756225 bact
GCF_000820425 bact
GCF_000876915 bact
GCF_000938615 bact
GCF_001037665 bact
GCF_001085385 bact
GCF_001089745 bact
GCF_001141605 bact
GCF_001220925 bact
GCF_001253695 bact
GCF_001255635 bact
GCF_001475075 bact
GCF_001520655 bact
GCF_001568155 bact
GCF_001572005 bact
GCF_001594535 bact
GCF_001648945 bact
GCF_001857705 euk
GCF_002033825 bact
GCF_002076415 bact
GCF_002090455 bact
GCF_002178075 bact
GCF_002205495 bact
GCF_002230975 bact
GCF_002232505 bact
GCF_002273155 bact
GCF_002283795 bact
GCF_002486075 bact
GCF_002514525 bact
GCF_002520825 bact


In [None]:

# Persist the k-mer counts. The mapping is converted to a plain dict first:
# a defaultdict is pickled together with its default factory, and a factory
# that is a lambda cannot be pickled, so dumping it directly would fail.
with open('saved_dictionary.pkl', 'wb') as f:
    pickle.dump(dict(kmer_dict), f)

# To reload the counts later (only unpickle files you created yourself --
# loading a pickle can execute arbitrary code):
#with open('saved_dictionary.pkl', 'rb') as f:
#    loaded_dict = pickle.load(f)


In [14]:
# Number of distinct valid 5-mers observed. With the four-letter alphabet
# A/C/G/T the maximum possible value is 4**5 = 1024.
len(kmers_visited)

1024