In [1]:
import os
import gzip
import pickle
import requests
from collections import defaultdict

import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from sklearn.metrics import accuracy_score

Make directories and read in the list of k-mers (used to ensure the correct order), the model, and the RefSeq genome summary table (in which each genome is pre-classified as eukaryotic, bacterial, archaeal, or viral).

In [2]:
# Create the download directory; exist_ok makes the cell safe to re-run and
# avoids the separate exists-then-create check.
os.makedirs('./Validation_genomes', exist_ok=True)

# Labeled RefSeq assembly summary. Later cells read its 'Classification',
# '# assembly_accession' and 'ftp_path' columns.
df = pd.read_csv("assembly_summary_refseq_labeled.csv")

# NOTE(security): pickle.load can execute arbitrary code, so only load
# model / k-mer files that come from a trusted source.
# The context managers close the file handles as soon as loading finishes.
with open("kmer_SVM.sav", 'rb') as model_file:
    clf = pickle.load(model_file)
with open("kmer_list.pkl", 'rb') as kmer_file:
    kmer_list = pickle.load(kmer_file)

  interactivity=interactivity, compiler=compiler, result=result)


Download 10 eukaryotic and 10 bacterial genomes to test. Note: While I am only testing 10 of each, I break up these genomes into smaller contigs and thus end up with thousands of contigs that I am actually testing. Another way to approach this could be to download more genomes and sample fewer contigs from each genome.

In [3]:
def download_genome(ftp, out_dir="Validation_genomes"):
    """Download the genomic FASTA archive of one assembly.

    Parameters
    ----------
    ftp : str
        Value of the 'ftp_path' column for the assembly; its last path
        component is used as the genome identifier.
    out_dir : str
        Directory that receives '<genome_id>.fna.gzip'.

    Returns
    -------
    str
        Path of the downloaded (or already present) file.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Without this check an
        error page would be saved as if it were a genome archive.

    Notes
    -----
    A file that already exists is not downloaded again; delete it to force
    a fresh download.
    """
    ftp = ftp.rstrip("/")
    genome_id = ftp.split("/")[-1]
    # URLs always use forward slashes; os.path.join would use the OS separator.
    gen_url = (ftp + "/" + genome_id + "_genomic.fna.gz").replace(" ", "_")
    out_path = os.path.join(out_dir, genome_id + ".fna.gzip")
    if os.path.exists(out_path):
        return out_path

    # Stream to a temporary name so an interrupted transfer never leaves a
    # truncated file under the final name.
    tmp_path = out_path + ".part"
    with requests.get(gen_url, allow_redirects=True, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(tmp_path, 'wb') as out_file:
            for block in r.iter_content(chunk_size=1 << 20):
                out_file.write(block)
    os.replace(tmp_path, out_path)
    return out_path


# Fixed random_state values keep the sampled genomes reproducible.
# .copy() gives independent frames so the column assignment below does not
# write into a slice of df.
sampled_euks = df[df['Classification'] == 'eukaryote'].sample(10, random_state=100).copy()
sampled_bact = df[df['Classification'] == 'bacteria'].sample(10, random_state=200).copy()

# Keep only the part of the accession before the first '.'.
sampled_euks['# assembly_accession'] = sampled_euks['# assembly_accession'].str.split('.').str[0]
sampled_bact['# assembly_accession'] = sampled_bact['# assembly_accession'].str.split('.').str[0]

sampled_euks_list = sampled_euks['# assembly_accession'].to_list()
sampled_bact_list = sampled_bact['# assembly_accession'].to_list()

for ftp in sampled_euks['ftp_path'].to_list() + sampled_bact['ftp_path'].to_list():
    download_genome(ftp)

Count the canonical 5-mers in 50 kb chunks ("contigs") of the downloaded genomes.

In [4]:
def is_valid_sequence(seq):
    ''' Return True only when every symbol of seq is an upper-case A, T, C or G.

    Used to discard kmers that contain N's (or any other symbol).
    An empty sequence is considered valid.
    '''
    allowed_bases = {"A", "T", "C", "G"}
    return all(base in allowed_bases for base in seq)

# Sampling parameters. Each record of at least CHUNK_SIZE bp is cut into
# CHUNK_SIZE-bp pieces ("contigs"), and pieces are taken from a genome until
# its running bp total reaches GENOME_BP_LIMIT.
CHUNK_SIZE = 50000
GENOME_BP_LIMIT = 1000000
KMER_SIZE = 5

# Complement table; only applied to k-mers already validated as pure A/C/G/T.
COMPLEMENT = str.maketrans("ACGT", "TGCA")

# (contig name, k-mer string) -> count
kmer_dict = defaultdict(int)
# Names of the contigs that were actually counted. Starts empty: seeding the
# set with " " would add a bogus all-zero contig to the output table.
contigs = set()
# Set copy of kmer_list for O(1) membership tests on plain strings.
canonical_kmers = set(kmer_list)

directory = "./Validation_genomes"
euk_count = 0
bact_count = 0
# sorted() makes the contig numbering independent of filesystem order.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".fna.gzip"):
        continue

    genome_id = "_".join(filename.split("_")[0:2]).split('.')[0]

    if genome_id in sampled_euks_list:
        genome_type = 'euk'
    elif genome_id in sampled_bact_list:
        genome_type = 'bact'
    else:
        genome_type = 'NA'

    print(genome_id, genome_type)

    # A file that belongs to neither sample has no label; skip it instead of
    # filing its k-mers under whatever contig name was used last.
    if genome_type == 'NA':
        continue

    genome_bp_count = 0
    with gzip.open(os.path.join(directory, filename), "rt") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            # Budget used up: stop parsing the rest of this genome.
            if genome_bp_count >= GENOME_BP_LIMIT:
                break
            if len(record.seq) < CHUNK_SIZE:
                continue

            for start in range(0, len(record.seq), CHUNK_SIZE):
                # The budget is charged before it is tested, so a genome
                # contributes at most GENOME_BP_LIMIT / CHUNK_SIZE - 1 chunks.
                genome_bp_count += CHUNK_SIZE
                if genome_bp_count >= GENOME_BP_LIMIT:
                    break

                # Only chunks that are really counted get a name, so the
                # counters equal the number of contigs produced per class.
                if genome_type == 'euk':
                    euk_count += 1
                    contig_name = "euk_" + str(euk_count)
                else:
                    bact_count += 1
                    contig_name = "bact_" + str(bact_count)
                contigs.add(contig_name)

                chunk = str(record.seq[start:start + CHUNK_SIZE]).upper()
                # Stop KMER_SIZE - 1 positions early so every slice is full length.
                for j in range(len(chunk) - KMER_SIZE + 1):
                    kmer = chunk[j:j + KMER_SIZE]
                    # Drop k-mers containing N or any other non-ACGT symbol.
                    if not is_valid_sequence(kmer):
                        continue
                    # Count under the reverse complement when that is the
                    # form listed in kmer_list.
                    rev_comp = kmer.translate(COMPLEMENT)[::-1]
                    if rev_comp in canonical_kmers:
                        kmer = rev_comp
                    kmer_dict[(contig_name, kmer)] += 1


GCF_000002415 euk




GCF_000313135 euk
GCF_000325025 bact
GCF_000523455 euk
GCF_000955945 euk
GCF_000986985 bact
GCF_001500285 euk
GCF_001548555 euk
GCF_001613715 bact
GCF_002000525 bact
GCF_002512915 bact
GCF_003086295 euk
GCF_008831285 euk
GCF_009648635 bact
GCF_013052645 euk
GCF_017639745 euk
GCF_017920825 bact
GCF_018350345 bact
GCF_020163235 bact
GCF_900179305 bact


Write the count of each 5-mer in each contig to a tab-separated table (one row per 5-mer, one column per contig).

In [5]:
# Fix the column order once so the header and every data row agree.
contig_names = sorted(contigs)

# One row per k-mer (in kmer_list order), one column per contig.
# The context manager closes the file even if a write fails.
with open("kmer_matrix_validation.tsv", "w") as f:
    f.write("\t".join(["Contig"] + contig_names) + "\n")
    for kmer in kmer_list:
        # .get() reads a count without inserting a zero entry into the
        # defaultdict for every (contig, k-mer) pair that never occurred.
        counts = [str(kmer_dict.get((contig, kmer), 0)) for contig in contig_names]
        # Joining the fields avoids a trailing tab, so each row has exactly
        # as many fields as the header.
        f.write("\t".join([kmer] + counts) + "\n")

Read the kmer count table in as a pandas dataframe, remove contigs with few kmers, and divide each count by the sum of kmer counts on the contig (to get proportional data).

In [6]:
# Load the count table: rows are k-mers (indexed by the "Contig" column),
# columns are contigs.
raw_counts = pd.read_csv(
    "kmer_matrix_validation.tsv", sep="\t", index_col=False
).set_index("Contig")

# Keep only contigs whose total k-mer count exceeds 10000.
contig_totals = raw_counts.sum(axis=0)
well_covered = raw_counts.loc[:, contig_totals > 10000]

# Turn counts into per-contig proportions (each column divided by its sum).
kmer_df = well_covered.div(well_covered.sum(axis=0), axis=1)

We need to make sure the rows of the dataframe are sorted in exactly the same k-mer order that was used when the model was originally run.

In [7]:
# Put the rows (k-mers) in sorted k-mer order.
kmer_df = kmer_df.reindex(index=sorted(kmer_list))

Assign labels and prepare a list of lists of 5-mer frequencies.

In [9]:
# One feature vector (list of k-mer frequencies) per contig column.
contig_kmers = [kmer_df[contig].tolist() for contig in kmer_df]

# Contig names carry the class as a prefix: "euk_<n>" -> 1, anything else -> 0.
prelabels = kmer_df.columns.tolist()
labels = [1 if name.split("_")[0] == "euk" else 0 for name in prelabels]

Perform the predictions and assess accuracy.

In [10]:
# Classify every contig, then compare against the labels derived from the contig names.
predictions = clf.predict(contig_kmers)
validation_accuracy = accuracy_score(labels, predictions)
print(f"Accuracy in the validation set is {validation_accuracy * 100}%")

Accuracy in the validation set is 99.71014492753623%
