In [1]:
%%writefile ../machine_learning.py

"""
Script aims to read in the reference_dataframe file, select a taxonomic
level and group, and read the path to the location of that data. It then
prepares data for machine learning by converting base pair coding to numerical
encoding, pads it out and then runs the algorithm
"""

import pandas as pd
from Bio import SeqIO
import numpy as np
import os
import random
import argparse
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Dropout, MaxPooling1D, Flatten
from keras.utils import plot_model
import math
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,precision_score,recall_score,f1_score

def max_seq_len(SeqIO_dict):
    """
    Return the length of the longest sequence stored in SeqIO_dict.

    SeqIO_dict maps read ids to records that expose a ``.seq`` attribute.
    Raises ValueError when SeqIO_dict is empty.
    """
    return max(len(record.seq) for record in SeqIO_dict.values())


def numberfy(SeqIO_dict, seq_len, nsubsample, genus_name, species_name):
    """
    Randomly subsample reads and encode each one as a padded list of ints.

    Bases are encoded case-insensitively as A=0, C=1, G=2, T=3 and '-' as 4;
    every read is then right-padded with 4 up to 5000 positions.

    Parameters:
        SeqIO_dict: dict mapping read ids to records exposing ``.seq``
        seq_len: accepted for interface compatibility but not used; the
            padding width is fixed at 5000
        nsubsample: number of reads to draw without replacement
        genus_name, species_name: only used to name the ids file

    Side effect: writes the sampled read ids, one per line, to
    data_root + 'models/ids_<tax_rank>_<name>_<n_reads>_<genus>_<species>.txt'
    (reads the module-level ``data_root`` and ``args``).

    Returns a dict mapping each sampled read id to its list of ints.

    Raises ValueError if nsubsample exceeds the number of reads, or if a
    read contains a character other than A, C, G, T (any case) or '-'.
    """
    pad_len = 5000
    pad_code = 4
    base_codes = {"A": 0, "C": 1, "G": 2, "T": 3, "-": pad_code}

    keys = list(SeqIO_dict.keys())
    randkeys = random.sample(keys, k=nsubsample)

    with open(data_root+'models/ids_%s_%s_%s_%s_%s.txt' % (args.tax_rank,args.name,args.n_reads,genus_name,species_name), 'w+') as file:
        file.writelines("%s\n" % key for key in randkeys)

    num_dict = {}
    for key in randkeys:
        bases = str(SeqIO_dict[key].seq).upper()
        # Validate explicitly: the previous check looked for 't' after all
        # 't' had already been replaced, so it could never trigger.
        unexpected = sorted(set(bases) - set(base_codes))
        if unexpected:
            raise ValueError(
                "ERROR - strange value in sequence %s: %s"
                % (key, "".join(unexpected))
            )
        encoded = [base_codes[base] for base in bases]
        # A read longer than pad_len is left unpadded and untruncated
        # (a negative repeat count yields an empty list).
        encoded.extend([pad_code] * (pad_len - len(encoded)))
        num_dict[key] = encoded
    return num_dict


def get_model(X_train, Y_train, num_class):
    """
    Build, compile and fit the 1D convolutional classifier.

    The network expects inputs of shape (5000, 1) and ends in a softmax
    layer with num_class units. It is trained for 10 epochs with a batch
    size of 128.

    NOTE: the validation data comes from the module-level ``X_test`` and
    ``Y_test``, not from the parameters, so those must exist before this
    function is called.

    Returns the fitted model.
    """
    model = Sequential()
    for layer in (
        Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(5000,1)),
        Conv1D(filters=64, kernel_size=3, activation='relu'),
        Dropout(0.5),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dense(100, activation='relu'),
        Dense(num_class, activation='softmax'),
    ):
        model.add(layer)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    print()
    # Conv1D needs a trailing channel axis on the 2D read matrices
    train_inputs = np.expand_dims(X_train,2)
    validation = (np.expand_dims(X_test,2), Y_test)
    model.fit(train_inputs, Y_train, validation_data=validation, batch_size=128, epochs=10, verbose=1)
    return model

# Command line interface: two positional paths, optional selection flags
# and a mutually exclusive verbose/quiet switch.
DESCRIPTION = """
Script aims to read in the reference_dataframe file, select a taxonomic
level and group, and read the path to the location of that data. It then
prepares data for machine learning by converting base pair coding to numerical
encoding, pads it out and then runs the algorithm
"""
parser = argparse.ArgumentParser(description=DESCRIPTION)

for positional, help_text in (
    ("ref_df_fn", "File path to the reference dataframe"),
    ("data_root", "Root folder for analysis/"),
):
    parser.add_argument(positional, help=help_text)

for long_flag, short_flag, help_text in (
    ("--tax_rank", "-r", "taxonomic rank for analysis"),
    ("--name", "-n", "name of rank to select from"),
    ("--n_reads", "-c", "count of reads per class"),
    ("--one", "-1", "first species to test. not used if tax_rank and name present"),
    ("--two", "-2", "second species to test. not used if tax_rank and name present"),
):
    parser.add_argument(long_flag, short_flag, help=help_text)

group = parser.add_mutually_exclusive_group()
for switch_flags in (("--verbose", "-v", "--v"), ("--quiet", "-q", "--q")):
    group.add_argument(*switch_flags, action="store_true")

args = parser.parse_args()

# assign required arguments to variables
ref_df_fn = args.ref_df_fn
data_root = args.data_root

# assign a number of reads per class
# --n_reads has no default, so reject a missing or non-integer value here
# instead of failing inside int() with an unrelated-looking traceback
if args.n_reads is None:
    parser.error("--n_reads/-c is required")
try:
    n_reads = int(args.n_reads)
except ValueError:
    parser.error("--n_reads/-c must be an integer, got %r" % args.n_reads)

# test to make sure both required file paths exist
# os.path.exists returns False rather than raising, so its result has to
# be checked explicitly (the old try/except could never report anything)
for required_path in (ref_df_fn, data_root):
    if not os.path.exists(required_path):
        parser.error('Cannot find %s' % required_path)

if args.verbose:
    print('\033[1;34m' + "Reference dataframe is at " + ref_df_fn + '\033[0m')
    print('\033[1;34m' + "Root directory is at " + data_root + '\033[0m')
    if args.tax_rank and args.name:
        print('\033[1;34m' + "Tax Rank is " + args.tax_rank.lower() + '\033[0m')
        print('\033[1;34m' + "Name is " + args.name.lower() + '\033[0m')
    elif args.one and args.two:
        print('\033[1;34m' + "Species one is " + args.one.lower() + '\033[0m')
        print('\033[1;34m' + "Species two is " + args.two.lower() + '\033[0m')
    print('\033[1;34m' + "Count of reads per sample is", n_reads,'\033[0m')
    
# read in the reference dataframe from the argument path
ref_df = pd.read_csv(ref_df_fn, index_col=None)

# check whether the reference dataframe implies there are enough reads
# to continue given n_reads
# The exit is deliberately outside any try block: a bare `except:` also
# catches SystemExit, which previously let the script carry on after
# reporting that species need more reads.
if "# for use" not in ref_df.columns:
    # best-effort, as before: only warn, since the column is not needed
    # on every code path
    print('Check %s to have the wanted column names' % ref_df_fn)
else:
    short_of_reads = ref_df[ref_df["# for use"] < n_reads]
    if short_of_reads.shape[0] > 0:
        print("These species need more reads.")
        print(short_of_reads)
        raise SystemExit(1)
    
# assign flagged variables as lower case and select the matching rows
# of the reference dataframe (``indices`` holds their index labels)
if args.tax_rank and args.name:
    tax_rank = args.tax_rank.lower()
    name = args.name.lower()
    # an unknown column would raise KeyError; an unknown name just gives
    # an empty selection, so both cases are tested explicitly
    if tax_rank in ref_df.columns:
        indices = ref_df[ref_df[tax_rank] == name].index
    else:
        indices = []
    if len(indices) == 0:
        print("Tax_rank or Name not found in reference_dataframe")
        print(tax_rank, name)
        raise SystemExit(1)
    print(indices)
elif args.one and args.two:
    # NOTE(review): tax_rank and name are not assigned in this mode, yet
    # later top-level code reads them (class column lookup, plot title).
    one = args.one.lower()
    two = args.two.lower()
    if 'species' in ref_df.columns:
        missing_species = {one, two} - set(ref_df['species'])
    else:
        missing_species = {one, two}
    if missing_species:
        print("Species inputs not found in reference_dataframe")
        print(one, two)
        raise SystemExit(1)
    # a row can only match one species, so the two masks are OR-ed;
    # AND-ing them always selected nothing
    indices = ref_df[(ref_df['species'] == one)|(ref_df['species'] == two)].index
    print(indices)
else:
    parser.error("provide either --tax_rank and --name, or --one and --two")

# SeqIO_dicts maps each selected reference row index to the reads parsed
# from that row's 'path for use' fasta file (read id -> record)
SeqIO_dicts = {}
for index in indices:
    fasta_path = ref_df.loc[index, 'path for use']
    try:
        SeqIO_dicts[index] = SeqIO.to_dict(SeqIO.parse(fasta_path, "fasta"))
    except (OSError, TypeError, ValueError) as err:
        # best-effort as before: report and carry on with the other rows,
        # but no longer swallow KeyboardInterrupt/SystemExit and show the
        # real reason (missing file, bad path value, duplicate read ids)
        print('Check location of fasta files')
        print(fasta_path, "does not exist")
        print(err)
        
# each path within an index corresponds to a species
# if tax_rank > genus, we want to look at which species are within which genus/family/order etc.

# determine the maximum sequence length of accepted sequences
# (one maximum per loaded fasta file, then the overall maximum)
total_lens = []
for key, value in SeqIO_dicts.items():
    total_lens.append(max_seq_len(value))
print('\033[0;32m'+"The maximum sequence length of all sampled sequences is"+ '\033[1;37m',max(total_lens),'\033[0m')


# randomly subsample reads from each index's corresponding set of reads
# and convert base pair coding to numerical coding via numberfy.
# NOTE: max_len is passed to numberfy as seq_len, but numberfy pads to a
# fixed 5000 positions and does not use it.
numSeqIO_dicts = {}
max_len = max(total_lens)
del total_lens
if (args.one and args.two) or tax_rank == "genus":
    # species-level comparison: every selected row is its own class and
    # contributes exactly n_reads reads
    for key, value in SeqIO_dicts.items():
        numSeqIO_dicts[key] = numberfy(value, max_len, n_reads,ref_df.loc[key,'genus'], ref_df.loc[key,'species'])
else:
    # the class column is the column immediately to the left of tax_rank
    # in the reference dataframe
    location = (ref_df.columns.get_loc(tax_rank)-1)
    col_name = ref_df.columns[location]
    if args.verbose:
        print('location is', col_name)

    classes = ref_df.iloc[indices,location].unique()
    if args.verbose:
        print('classes are', classes)

    # count_dict: class name -> number of selected rows in that class
    count_dict = {}
    for class_ in classes:
        count_dict[class_] = sum(ref_df.iloc[indices,location] == class_)
    if args.verbose:
        print('count_dict is', count_dict)
    
    del classes
        
    # smallest '# for use' value per class; note the mask here covers the
    # whole reference dataframe, not only the selected indices
    min_vals = []
    for class_, n_class in count_dict.items():
#         if n_class == min(count_dict.values()):
            min_vals.append(ref_df[ref_df.iloc[:,location] == class_]['# for use'].min())
    # round the overall minimum down to an even number, capped at 35000
    if min(min_vals) % 2 == 0:
        minimum_value = int(min(min_vals))
    else:
        minimum_value = int(min(min_vals)-1)
    if minimum_value > 35000:
        minimum_value = 35000
        
    del min_vals
    
    if args.verbose:
        print('minimum number of reads is', minimum_value)
    # class_lens_ind records the per-row sample size used, in call order
    class_lens_ind = []
    if len(count_dict) > 1:
        # max_reads = (row count of the largest class) * n_reads
        max_reads = 0
        for key, value in count_dict.items():
            if value == max(count_dict.values()):
                max_reads = value*n_reads

        # never take more per class than max_reads
        if max_reads <= minimum_value:
            minimum_value = max_reads

        for key, n_class in count_dict.items():
            # split the per-class budget evenly over the rows of the class
            s_reads = int(minimum_value/n_class)
            # NOTE(review): when a row has fewer reads than s_reads,
            # minimum_value is replaced by min/n_class and s_reads then
            # divides by n_class a second time; minimum_value also stays
            # lowered for the classes processed afterwards - confirm this
            # is intended.
            if ref_df[ref_df.loc[:,col_name]==key]['# for use'].min() < s_reads:
                minimum_value = ref_df[ref_df.loc[:,col_name]==key]['# for use'].min()/n_class
                s_reads = int(minimum_value/n_class)
            if args.verbose:
                print('The class is', key, 'and the number of reads to be subsampled is', s_reads)
            # encode s_reads reads from every loaded row belonging to this class
            for keya, value in SeqIO_dicts.items():
                if ref_df.loc[keya,col_name] == key:
                    numSeqIO_dicts[keya] = numberfy(value, max_len, s_reads, ref_df.loc[keya,'genus'], ref_df.loc[keya,'species'])
                    class_lens_ind.append(s_reads)
        # from here on n_reads holds the per-class budget
        n_reads = minimum_value
    elif len(count_dict) == 1:
        # a single class leaves nothing to classify against: stop
        s_reads = n_reads
        print("no comparison for the rank")
        exit()

# count_dict only exists when the non-genus branch above ran, hence the
# guarded delete
try:
    del count_dict
except:
    pass

# the class column is the column immediately to the left of tax_rank
# NOTE(review): tax_rank is only assigned when --tax_rank and --name are
# given, so this line raises NameError in --one/--two mode.
location = (ref_df.columns.get_loc(tax_rank)-1)
col_name = ref_df.columns[location]
classes = ref_df.iloc[indices,location].unique()

del indices

# Gather the encoded reads class by class:
#   order      - reference row indices in the order their reads are stacked
#   seq_list   - one 2D array of encoded reads per reference row
#   class_lens - number of encoded reads collected for each class
order = []
seq_list = []
total_expected_reads = len(classes)*n_reads
class_lens = []
for class_ in classes:
    tmp_sum = []
    for key in numSeqIO_dicts.keys():
        if ref_df.loc[key,col_name] == class_:
            order.append(key)
            seq_list.append(np.array(list(numSeqIO_dicts[key].values())))
            tmp_sum.append(len(list(numSeqIO_dicts[key].values())))
    class_lens.append(sum(tmp_sum))

# free intermediates; the deletes are guarded because some names may not
# exist (e.g. tmp_sum when classes is empty)
try:
    del tmp_sum
except:
    pass
try:
    del numSeqIO_dicts
except:
    pass
    
# every class is cut down to the size of the smallest class
total_actual_reads = min(class_lens)

print(class_lens)
if args.verbose:
    print("Ids order for labels is", order)
    # class_lens_ind is only defined by the non-genus subsampling branch
    if not (args.one and args.two) and tax_rank != "genus":
        print("Number of reads subsampled per id is", class_lens_ind)
    print("Total expected reads is", total_expected_reads)
    for i in range(0, len(classes)):
        print(classes[i], "has", class_lens[i], "reads")
    print("Total reads used per class is", sum(class_lens))
    print("Total actual reads available per class is", total_actual_reads)

try:
    del class_lens_ind
except:
    pass
    
# stack all reads into one matrix; rows are grouped by class because
# seq_list was filled class by class
seq_comb = np.concatenate(seq_list, axis = 0)
num_class = len(classes)

try:
    del seq_list
except:
    pass

if len(set(class_lens)) == 1:
    # classes already have equal size: use the stacked matrix as is
    all_data = seq_comb
else:
    # keep only the first total_actual_reads rows of each class block;
    # the cumulative sums give the block boundaries inside seq_comb
    class_lens_cumsum = np.cumsum(class_lens)
    new_seq_list = []
    for i in range(0, len(class_lens_cumsum)):
        if i == 0:
            new_seq_list.append(seq_comb[0:class_lens_cumsum[i]][:total_actual_reads])
        else:
            new_seq_list.append(seq_comb[class_lens_cumsum[i-1]:class_lens_cumsum[i]][:total_actual_reads])
    del seq_comb
    all_data = np.concatenate(new_seq_list, axis = 0)

try:
    del class_lens
except:
    pass
# new_seq_list only exists when the classes had to be truncated
try:
    del new_seq_list
except:
    pass
    
# Build one-hot labels: all_data holds total_actual_reads rows per class,
# stacked class after class, so class i owns the i-th block of rows.
all_labels_onehot = np.zeros( (total_actual_reads*num_class,num_class) )
for i in range(0, num_class):
    all_labels_onehot[i*total_actual_reads:(i+1)*total_actual_reads,i] = 1

# Print the shape of the resulting arrays to visually verify
if args.verbose:
    print('all_labels_onehot.shape: ', all_labels_onehot.shape)
    print('all_data.shape:', all_data.shape)

samples_count = total_actual_reads*num_class
if args.verbose:
    print('samples_per_class:', total_actual_reads)
    print('samples_count:', samples_count)

# A random permutation of all row positions, used to shuffle data and
# labels consistently
shuffle_indices = random.sample(range(0, samples_count), samples_count)
if args.verbose:
    print(len(shuffle_indices))

# Assign 85% of the data for training and the rest for testing
train_size = math.floor(0.85*all_data.shape[0])
if args.verbose:
    print("Training data size:", train_size)
indices_train = shuffle_indices[:train_size]
# The test slice starts right at train_size; starting at train_size+1
# silently dropped one sample from both sets.
indices_test = shuffle_indices[train_size:]

# These names are always bound at this point, so they can be released
# without guarding the del statements.
del train_size, samples_count, shuffle_indices

# Define the data vs labels for each of the training and test sets
X_train = all_data[indices_train,:]
Y_train = all_labels_onehot[indices_train]
X_test = all_data[indices_test,:]
Y_test = all_labels_onehot[indices_test]

# the full matrix is no longer needed once both subsets are copied out
del all_data

if args.verbose:
    print('X_train.shape : ', X_train.shape)
    print('X_test.shape : ', X_test.shape)
    print('Y_train.shape : ', Y_train.shape)
    print('Y_test.shape : ', Y_test.shape)

# Number of positions in one encoded read
in_dim = X_train.shape[1]

# Map each class position (as a string) to its class name; this is the
# key written out next to the model so predictions can be decoded later
classes_dict = {'%s' % position: label for position, label in enumerate(classes)}

# 1D convolutional classifier: two conv layers, dropout, max pooling and
# a dense head ending in a softmax over the classes
model = Sequential()
for layer in (
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(in_dim,1)),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    Dropout(0.5),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(100, activation='relu'),
    Dense(num_class, activation='softmax'),
):
    model.add(layer)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()
print()

# Conv1D needs a trailing channel axis on the 2D read matrices
train_inputs = np.expand_dims(X_train,2)
validation = (np.expand_dims(X_test,2), Y_test)
history = model.fit(train_inputs, Y_train, validation_data=validation, batch_size=128, epochs=10, verbose=1)

# Save the trained model, then plot training vs validation accuracy
# per epoch and store the figure under plot_histories/.
# NOTE(review): name and tax_rank are only assigned when --tax_rank and
# --name are given, so the title line fails in --one/--two mode.
model.save(data_root+'models/model_%s_%s_%s.h5' % (args.tax_rank,args.name,args.n_reads))
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy for %s %s' % (name, tax_rank))
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
# Save before showing: once the window opened by show() is closed the
# figure may be gone, and savefig would then write an empty image.
plt.savefig(data_root+'plot_histories/history_%s_%s_%s.png' % (args.tax_rank,args.name,args.n_reads))
plt.show()
plt.close()

# Predict once and take the index of the largest softmax output as the
# predicted class. Sequential.predict_classes is not available in current
# Keras/TensorFlow releases, and calling it ran a second forward pass.
yhat_probs = model.predict(np.expand_dims(X_test,2), verbose=0)
yhat_classes = np.argmax(yhat_probs, axis=-1)
print(yhat_classes.shape)
print(yhat_classes)

# recover the integer class of every test row from its one-hot label
Y_test_ints = np.where(Y_test==1)[1]
print(Y_test_ints.shape)
# keep only the first output column, as before (not used further below)
yhat_probs = yhat_probs[:, 0]
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(Y_test_ints, yhat_classes)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp), one value per class
precision = precision_score(Y_test_ints, yhat_classes, average=None)
print('precision: ', precision)
# recall: tp / (tp + fn), one value per class
recall = recall_score(Y_test_ints, yhat_classes, average=None)
print('recall: ', recall)
# f1: 2 tp / (2 tp + fp + fn), one value per class
f1 = f1_score(Y_test_ints, yhat_classes, average=None)
print('f1: ', f1)
# confusion matrix
matrix = confusion_matrix(Y_test_ints, yhat_classes)
print(matrix)
crosstab = pd.crosstab(Y_test_ints, yhat_classes, rownames=['True'], colnames=['Predicted'], margins=True)
print(crosstab)

# Flatten all scores into a single row: accuracy first, then the
# per-class precision, recall and f1 values with 1-based column suffixes
data = [accuracy]
datacol = ['accuracy']
for metric_name, per_class in (('precision', precision), ('recall', recall), ('f1', f1)):
    for count, score in enumerate(per_class, start=1):
        data.append(score)
        datacol.append('%s%i' % (metric_name, count))
stats = pd.DataFrame(data=[data],columns=datacol)
stats.to_csv(data_root+'models/stats_%s_%s_%s.csv' % (args.tax_rank,args.name,args.n_reads))
crosstab.to_csv(data_root+'models/confusion_%s_%s_%s.csv' % (args.tax_rank,args.name,args.n_reads))
# keys file: "<class position>,<class name>" per line, needed to decode
# the model's predictions later
with open(data_root+'models/keys_%s_%s_%s.csv' % (args.tax_rank,args.name,args.n_reads), 'w+') as f:
    for key, class_name in classes_dict.items():
        f.write("%s,%s\n"%(key,class_name))
if args.verbose:
    print(classes_dict)

Overwriting ../machine_learning.py


### Mock Community

In [6]:
%%time

import pandas as pd
from Bio import SeqIO
import numpy as np
import json
import os
import random
import argparse
import keras
from keras.models import Sequential, load_model
from keras.layers import Dense, Conv1D, Dropout, MaxPooling1D, Flatten
from keras.utils import plot_model, to_categorical
import math
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,precision_score,recall_score,f1_score

def getquery_taxfileid(ref_df, species):
    """
    Look up the taxfileid of a species in the reference dataframe.

    ref_df is the reference dataframe (it needs a ``species`` column) and
    species is the species name to search for. The taxfileid is the
    date/flowcellid, i.e. the column 0 value, of the first matching row.
    Raises IndexError when no row matches.
    """
    matching_rows = ref_df[ref_df.species == species]
    first_column = matching_rows.iloc[:, 0]
    return first_column.values[0]

def get_taxid_dict(taxid_fn, taxfileid):
    """
    Read the taxonomic assignment of one identifier from a Qiime format
    taxonomy file.

    Every line that starts with taxfileid is parsed: its second
    tab-separated field is split on ';' and each 'rank__name' entry is
    stored as tax_dict[rank] = name. Later matching lines overwrite
    earlier ones. Returns an empty dict when no line matches.
    """
    tax_dict = {}
    with open(taxid_fn, 'r') as fh:
        matching_lines = (line for line in fh if line.startswith(taxfileid))
        for line in matching_lines:
            lineage = line.rstrip().split('\t')[1]
            for assignment in lineage.split(';'):
                pieces = assignment.split('__')
                tax_dict[pieces[0]] = pieces[1]
    return tax_dict

def numberfy(SeqIO_dict, seq_len, nsubsample, species_name):
    """
    Encode a single sequence record as a padded list of ints.

    Bases are encoded case-insensitively as A=0, C=1, G=2, T=3 and '-' as 4;
    the read is then right-padded with 4 up to 5000 positions.

    Parameters:
        SeqIO_dict: despite the name, ONE record exposing ``.id`` and
            ``.seq`` (not a dict of records)
        seq_len, nsubsample, species_name: accepted for interface
            compatibility but not used; the padding width is fixed at 5000

    Returns a dict with a single entry mapping the record id to its list
    of ints.

    Raises ValueError if the read contains a character other than
    A, C, G, T (any case) or '-'.
    """
    pad_len = 5000
    pad_code = 4
    base_codes = {"A": 0, "C": 1, "G": 2, "T": 3, "-": pad_code}

    bases = str(SeqIO_dict.seq).upper()
    # Validate explicitly: the previous check looked for 't' after all 't'
    # had already been replaced, so it could never trigger.
    unexpected = sorted(set(bases) - set(base_codes))
    if unexpected:
        raise ValueError(
            "ERROR - strange value in sequence %s: %s"
            % (SeqIO_dict.id, "".join(unexpected))
        )
    encoded = [base_codes[base] for base in bases]
    # A read longer than pad_len is left unpadded and untruncated
    # (a negative repeat count yields an empty list).
    encoded.extend([pad_code] * (pad_len - len(encoded)))
    return {SeqIO_dict.id: encoded}

# Inputs for the mock community evaluation
mock_taxonomy_file_fn = os.path.abspath('/media/MassStorage/tmp/TE/honours/analysis/Stats/mock_taxonomy_file_qiime.csv')

# Ranks are walked from the broadest (kingdom) down to genus
tax_ranks = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus']

# nodes.csv is a headerless, space-separated table of rank/name pairs
nodes = pd.read_csv('../../analysis/Stats/nodes.csv', sep=' ', header=None)
nodes.columns = ['tax_rank','tax_name']

ref_df = pd.read_csv('../../analysis/Stats/mock_reference_dataframe.csv', index_col=None)
large_ref_df = pd.read_csv('../../analysis/Stats/large_mock_reference_dataframe.csv', index_col=None)

mock_records = SeqIO.parse("../../analysis/Mapping/mock/subsample_reads/mock_community_200.fasta", "fasta")
full_mock_dict = SeqIO.to_dict(mock_records)
n_per_species = 200.

# Distinct species names in order of first appearance; the species is the
# second space-separated token of each read description
species_list = []
for record in full_mock_dict.values():
    described_species = record.description.split(' ')[1]
    if described_species not in species_list:
        species_list.append(described_species)

# species -> (expected taxonomy per rank, running per-rank tally)
all_values_dict = {}
for species in species_list:
    taxfileid = getquery_taxfileid(ref_df, species)
    query_tax_dict = get_taxid_dict(mock_taxonomy_file_fn, taxfileid)
    rank_tally = {'k': 0, 'p': 0, 'c': 0, 'o': 0, 'f': 0, 'g': 0, 's': 0}
    all_values_dict[species] = (query_tax_dict, rank_tally)
# print(all_values_dict)
        
    
    
# Classify every mock community read rank by rank (kingdom -> genus) and
# accumulate, per species, the fraction of reads whose prediction matches
# the reference dataframe at each rank. Results are dumped to one JSON
# file per species after every 200th read.
for i in range(0, len(full_mock_dict.keys())):
    # for i in range(600,800):
    # adjust the range here to test for candida species
    # as described in ranges in the above cell
    print('%s/%s' % (i+1,len(full_mock_dict.keys())))
    key = list(full_mock_dict.keys())[i]
    # the species is the second space-separated token of the description
    species_name = full_mock_dict[key].description.split(' ')[1]
    # announce the species at the start of each block of 200 reads
    # (presumably reads are grouped by species, 200 each - verify)
    if i == 0 or i % 200 == 0:
        print(species_name)
    max_len = len(full_mock_dict[key].seq)

    # convert base pair coding to numerical coding and 
    # pad the read (numberfy pads to a fixed 5000 positions and ignores
    # max_len and n_reads)
    n_reads = 1
    
    numSeqIO_dicts = {}
    numSeqIO_dicts[key] = numberfy(full_mock_dict[key], max_len, n_reads, species_name)
    seq_list = []
    for key in numSeqIO_dicts.keys():
        seq_list.append(np.array(list(numSeqIO_dicts[key].values())))
#     print(seq_list)

    # a matrix with one row: the single encoded read
    all_data = np.concatenate(seq_list)
    num_class = len(numSeqIO_dicts)

#     print('all_data.shape:', all_data.shape)
    
    # samples_count and train_size are computed but not used below
    samples_count = n_reads*num_class

    # # Assign a percentage of data for training and the rest for testing
    train_size = math.floor(1*all_data.shape[0])

    # # Define the data vs labels for each of the training and test sets
    X_test = all_data[:,:]

#     print('X_test.shape : ', X_test.shape)

#     print(ref_df[ref_df['species'] == species_name]['kingdom'].to_list()[0])
    # the 'k' tally is incremented for every read, without any prediction
    all_values_dict[species_name][1]['k'] += 1./n_per_species
    
    # counter is the ref_df column index holding the expected value for
    # the current prediction; it moves one column left after every rank.
    # The tally key is the first letter of that column's name.
    counter = 6
    for tax_rank in tax_ranks:
#         print('tax_rank =', tax_rank)
        if tax_rank == 'kingdom':
            # top of the hierarchy: always start from the 'fungi' model
            model = load_model('../../analysis/models/model_%s_%s_15000.h5' % (tax_rank, 'fungi'))
            classes = pd.read_csv('../../analysis/models/keys_%s_%s_15000.csv' % (tax_rank, 'fungi'), header=None)
            classes.columns = ['predict','pred_name']
            scores = model.predict(np.expand_dims(X_test,2))
            predicts = model.predict_classes(np.expand_dims(X_test,2))
            # translate the predicted class index into its name via the keys file
            predicted_class = classes[classes['predict'] == predicts[0]]['pred_name'].to_list()[0]
#             print('Predicted class is', predicted_class)

#             print(ref_df[ref_df.iloc[:,1] == species_name].iloc[:,counter].to_list()[0])
            if ref_df[ref_df.iloc[:,1] == species_name].iloc[:,counter].to_list()[0] == predicted_class:
                all_values_dict[species_name][1][ref_df[ref_df.iloc[:,1] == species_name].columns[counter][0]] += 1./n_per_species
            counter -= 1
            keras.backend.clear_session()
        elif tax_rank == 'genus':
            # predicted_class carries over from the previous (broader) rank
            if predicted_class not in nodes['tax_name'].values:
                # predicted_class is not listed in nodes (presumably no
                # model exists for it - verify): take the next value
                # directly from large_ref_df instead of predicting
                # then add 1 to each correct count if correct
                predicted_class = large_ref_df[large_ref_df[tax_rank] == predicted_class].iloc[:,counter].to_list()[0]
#                 print('Predicted class is', predicted_class)

#                 print(ref_df[ref_df.iloc[:,1] == species_name].iloc[:,counter].to_list()[0])
                # at this rank the expected value is compared on its part
                # after the first underscore
                if ref_df[ref_df.iloc[:,1] == species_name].iloc[:,counter].to_list()[0].split('_')[1] == predicted_class:
                    all_values_dict[species_name][1][ref_df[ref_df.iloc[:,1] == species_name].columns[counter][0]] += 1./n_per_species
                counter -= 1
                keras.backend.clear_session()
            else:
                # load the model trained for the class predicted one rank up
                classes = pd.read_csv('../../analysis/models/keys_%s_%s_15000.csv' % (tax_rank, predicted_class), header=None)
                classes.columns = ['predict','pred_name']
#                 print('../../analysis/models/model_%s_%s_15000.h5' % (tax_rank, predicted_class))
                model = load_model('../../analysis/models/model_%s_%s_15000.h5' % (tax_rank, predicted_class))
                scores = model.predict(np.expand_dims(X_test,2))
                predicts = model.predict_classes(np.expand_dims(X_test,2))
                predicted_class = classes[classes['predict'] == predicts[0]]['pred_name'].to_list()[0]
#                 print('Predicted class is', predicted_class)

#                 print(ref_df[ref_df.iloc[:,1] == species_name].iloc[:,counter].to_list()[0])
                if ref_df[ref_df.iloc[:,1] == species_name].iloc[:,counter].to_list()[0].split('_')[1] == predicted_class:
                    all_values_dict[species_name][1][ref_df[ref_df.iloc[:,1] == species_name].columns[counter][0]] += 1./n_per_species
                counter -= 1
                keras.backend.clear_session()
        else:
            # intermediate ranks: same two cases as for genus, but the
            # expected value is compared as a whole
            if predicted_class not in nodes['tax_name'].values:
                # then add 1 to each correct count if correct
#                 print(large_ref_df[large_ref_df[tax_rank] == predicted_class].iloc[:,counter].to_list())
#                 print(predicted_class)
                predicted_class = large_ref_df[large_ref_df[tax_rank] == predicted_class].iloc[:,counter].to_list()[0]
#                 print('Predicted class is', predicted_class)

#                 print(ref_df[ref_df.iloc[:,1] == species_name].iloc[:,counter].to_list()[0])
                if ref_df[ref_df.iloc[:,1] == species_name].iloc[:,counter].to_list()[0] == predicted_class:
                    all_values_dict[species_name][1][ref_df[ref_df.iloc[:,1] == species_name].columns[counter][0]] += 1./n_per_species
                counter -= 1
                keras.backend.clear_session()
            else:
                classes = pd.read_csv('../../analysis/models/keys_%s_%s_15000.csv' % (tax_rank, predicted_class), header=None)
                classes.columns = ['predict','pred_name']
#                 print('../../analysis/models/model_%s_%s_15000.h5' % (tax_rank, predicted_class))
                model = load_model('../../analysis/models/model_%s_%s_15000.h5' % (tax_rank, predicted_class))
                scores = model.predict(np.expand_dims(X_test,2))
                predicts = model.predict_classes(np.expand_dims(X_test,2))
                predicted_class = classes[classes['predict'] == predicts[0]]['pred_name'].to_list()[0]
#                 print('Predicted class is', predicted_class)

#                 print(ref_df[ref_df.iloc[:,1] == species_name].iloc[:,counter].to_list()[0])
                if ref_df[ref_df.iloc[:,1] == species_name].iloc[:,counter].to_list()[0] == predicted_class:
                    all_values_dict[species_name][1][ref_df[ref_df.iloc[:,1] == species_name].columns[counter][0]] += 1./n_per_species
                counter -= 1
                keras.backend.clear_session()
#     print(all_values_dict[species_name][1])
    # after every 200th read, print the tally of the current species and
    # write it to <species_name>.json
    if (i+1) % 200 == 0:
        print(all_values_dict[species_name][1])
        with open('/media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/ML_results/%s.json' % species_name, 'w+') as fp:
            json.dump(all_values_dict[species_name][1], fp)

1/2600
aspergillus_flavus
2/2600
3/2600
4/2600
5/2600
6/2600
7/2600
8/2600
9/2600
10/2600
11/2600
12/2600
13/2600
14/2600
15/2600
16/2600
17/2600
18/2600
19/2600
20/2600
21/2600
22/2600
23/2600
24/2600
25/2600
26/2600
27/2600
28/2600
29/2600
30/2600
31/2600
32/2600
33/2600
34/2600
35/2600
36/2600
37/2600
38/2600
39/2600
40/2600
41/2600
42/2600
43/2600
44/2600
45/2600
46/2600
47/2600
48/2600
49/2600
50/2600
51/2600
52/2600
53/2600
54/2600
55/2600
56/2600
57/2600
58/2600
59/2600
60/2600
61/2600
62/2600
63/2600
64/2600
65/2600
66/2600
67/2600
68/2600
69/2600
70/2600
71/2600
72/2600
73/2600
74/2600
75/2600
76/2600
77/2600
78/2600
79/2600
80/2600
81/2600
82/2600
83/2600
84/2600
85/2600
86/2600
87/2600
88/2600
89/2600
90/2600
91/2600
92/2600
93/2600
94/2600
95/2600
96/2600
97/2600
98/2600
99/2600
100/2600
101/2600
102/2600
103/2600
104/2600
105/2600
106/2600
107/2600
108/2600
109/2600
110/2600
111/2600
112/2600
113/2600
114/2600
115/2600
116/2600
117/2600
118/2600
119/2600
120/2600
121/2600


835/2600
836/2600
837/2600
838/2600
839/2600
840/2600
841/2600
842/2600
843/2600
844/2600
845/2600
846/2600
847/2600
848/2600
849/2600
850/2600
851/2600
852/2600
853/2600
854/2600
855/2600
856/2600
857/2600
858/2600
859/2600
860/2600
861/2600
862/2600
863/2600
864/2600
865/2600
866/2600
867/2600
868/2600
869/2600
870/2600
871/2600
872/2600
873/2600
874/2600
875/2600
876/2600
877/2600
878/2600
879/2600
880/2600
881/2600
882/2600
883/2600
884/2600
885/2600
886/2600
887/2600
888/2600
889/2600
890/2600
891/2600
892/2600
893/2600
894/2600
895/2600
896/2600
897/2600
898/2600
899/2600
900/2600
901/2600
902/2600
903/2600
904/2600
905/2600
906/2600
907/2600
908/2600
909/2600
910/2600
911/2600
912/2600
913/2600
914/2600
915/2600
916/2600
917/2600
918/2600
919/2600
920/2600
921/2600
922/2600
923/2600
924/2600
925/2600
926/2600
927/2600
928/2600
929/2600
930/2600
931/2600
932/2600
933/2600
934/2600
935/2600
936/2600
937/2600
938/2600
939/2600
940/2600
941/2600
942/2600
943/2600
944/2600
945/2600
9

1602/2600
1603/2600
1604/2600
1605/2600
1606/2600
1607/2600
1608/2600
1609/2600
1610/2600
1611/2600
1612/2600
1613/2600
1614/2600
1615/2600
1616/2600
1617/2600
1618/2600
1619/2600
1620/2600
1621/2600
1622/2600
1623/2600
1624/2600
1625/2600
1626/2600
1627/2600
1628/2600
1629/2600
1630/2600
1631/2600
1632/2600
1633/2600
1634/2600
1635/2600
1636/2600
1637/2600
1638/2600
1639/2600
1640/2600
1641/2600
1642/2600
1643/2600
1644/2600
1645/2600
1646/2600
1647/2600
1648/2600
1649/2600
1650/2600
1651/2600
1652/2600
1653/2600
1654/2600
1655/2600
1656/2600
1657/2600
1658/2600
1659/2600
1660/2600
1661/2600
1662/2600
1663/2600
1664/2600
1665/2600
1666/2600
1667/2600
1668/2600
1669/2600
1670/2600
1671/2600
1672/2600
1673/2600
1674/2600
1675/2600
1676/2600
1677/2600
1678/2600
1679/2600
1680/2600
1681/2600
1682/2600
1683/2600
1684/2600
1685/2600
1686/2600
1687/2600
1688/2600
1689/2600
1690/2600
1691/2600
1692/2600
1693/2600
1694/2600
1695/2600
1696/2600
1697/2600
1698/2600
1699/2600
1700/2600
1701/2600


2362/2600
2363/2600
2364/2600
2365/2600
2366/2600
2367/2600
2368/2600
2369/2600
2370/2600
2371/2600
2372/2600
2373/2600
2374/2600
2375/2600
2376/2600
2377/2600
2378/2600
2379/2600
2380/2600
2381/2600
2382/2600
2383/2600
2384/2600
2385/2600
2386/2600
2387/2600
2388/2600
2389/2600
2390/2600
2391/2600
2392/2600
2393/2600
2394/2600
2395/2600
2396/2600
2397/2600
2398/2600
2399/2600
2400/2600
{'k': 1.0000000000000007, 'p': 0.9950000000000008, 'c': 0.9600000000000007, 'o': 0.9600000000000007, 'f': 0.9600000000000007, 'g': 0.9600000000000007, 's': 0.9600000000000007}
2401/2600
zymoseptoria_tritici
2402/2600
2403/2600
2404/2600
2405/2600
2406/2600
2407/2600
2408/2600
2409/2600
2410/2600
2411/2600
2412/2600
2413/2600
2414/2600
2415/2600
2416/2600
2417/2600
2418/2600
2419/2600
2420/2600
2421/2600
2422/2600
2423/2600
2424/2600
2425/2600
2426/2600
2427/2600
2428/2600
2429/2600
2430/2600
2431/2600
2432/2600
2433/2600
2434/2600
2435/2600
2436/2600
2437/2600
2438/2600
2439/2600
2440/2600
2441/2600
244

### Wheat

In [3]:
%%time

import pandas as pd
from Bio import SeqIO
import numpy as np
import json
import os
import random
import argparse
import keras
from keras.models import Sequential, load_model
from keras.layers import Dense, Conv1D, Dropout, MaxPooling1D, Flatten
from keras.utils import plot_model, to_categorical
import math
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,precision_score,recall_score,f1_score

def getquery_taxfileid(ref_df, species):
    """
    Look up the taxonomy-file identifier of a species.

    ref_df  : reference dataframe (not a filename) exposing a `species`
              column; its first column holds the identifier, described by
              the original author as the date/flowcellid.
    species : species name to search for in the `species` column.

    Returns the first-column value of the first matching row. Raises
    IndexError when no row matches.
    """
    is_query_species = ref_df.species == species
    identifier_column = ref_df.iloc[:, 0]
    return identifier_column[is_query_species].values[0]

def get_taxid_dict(taxid_fn, taxfileid):
    """
    Read the taxonomic assignment of one identifier from a taxonomy file.

    taxid_fn  : path to a tab-separated taxonomy file whose second field is
                a ';'-separated lineage of '<rank>__<name>' entries
                (Qiime style, e.g. 'k__fungi;p__ascomycota').
    taxfileid : identifier to look for; every line that *starts with* this
                string is used, later lines overriding earlier ones.

    Returns a dict mapping each rank prefix to its name, or an empty dict
    when no line matches.
    """
    assignments = {}
    with open(taxid_fn, 'r') as handle:
        matching_rows = (row for row in handle if row.startswith(taxfileid))
        for row in matching_rows:
            lineage = row.rstrip().split('\t')[1]
            for entry in lineage.split(';'):
                pieces = entry.split('__')
                assignments[pieces[0]] = pieces[1]
    return assignments

def numberfy(SeqIO_dict, seq_len, nsubsample, species_name, pad_to=5000):
    """
    Encode one sequence record as a list of integer base codes.

    A/a -> 0, C/c -> 1, G/g -> 2, T/t -> 3 and the gap character '-' -> 4.
    The encoded sequence is right-padded with 4 until it holds `pad_to`
    entries.

    SeqIO_dict   : despite the name, a single record exposing `.id` and
                   `.seq` (the caller passes one entry of a SeqIO dict).
    seq_len      : unused; kept so existing calls keep working.
    nsubsample   : unused; kept so existing calls keep working.
    species_name : unused; kept so existing calls keep working.
    pad_to       : length to pad to (default 5000, the value that was
                   previously hard-coded).

    Returns a one-entry dict {record id: list of ints}. A sequence longer
    than `pad_to` is returned unpadded and is NOT truncated.

    Raises ValueError if the sequence contains any character other than
    ACGT/acgt/'-' (for example the ambiguity code N).
    """
    base_codes = {
        'A': 0, 'C': 1, 'G': 2, 'T': 3,
        'a': 0, 'c': 1, 'g': 2, 't': 3,
        '-': 4,
    }
    bases = str(SeqIO_dict.seq)

    # Fail loudly and name the offending characters, instead of relying on
    # int() to choke on a half-translated string or calling exit().
    unexpected = sorted(set(bases) - set(base_codes))
    if unexpected:
        raise ValueError(
            "ERROR - strange value in sequence %s: %s"
            % (SeqIO_dict.id, ''.join(unexpected))
        )

    encoded = [base_codes[base] for base in bases]
    # A negative difference multiplies to an empty list, so over-long
    # sequences simply receive no padding.
    encoded.extend([4] * (pad_to - len(encoded)))
    return {SeqIO_dict.id: encoded}

# Hierarchical classification of the wheat mock-community reads.
#
# Every read is pushed down the taxonomy one rank at a time: the model
# stored for the current node predicts the child taxon, which selects the
# model used at the next rank. For each species the fraction of reads
# classified correctly at each rank is accumulated and written to JSON.

wheat_taxonomy_file_fn = os.path.abspath('/media/MassStorage/tmp/TE/honours/analysis/Stats/wheat_taxonomy_file_qiime.csv')
# Ranks whose node model is consulted, root first. The model at rank R
# predicts the taxon one rank below R.
tax_ranks = ['kingdom',
             'phylum',
             'class', 'order', 'family', 'genus'
            ]
# Taxa for which a trained model is loaded below; a predicted taxon that is
# absent from this table is resolved through large_ref_df instead.
# NOTE(review): presumably these are the nodes with more than one child.
nodes = pd.read_csv('../../analysis/Stats/nodes.csv', sep=' ', header=None)
nodes.columns = ['tax_rank','tax_name']
ref_df = pd.read_csv('../../analysis/Stats/wheat_reference_dataframe_v2.csv', index_col=None)
large_ref_df = pd.read_csv('../../analysis/Stats/large_wheat_reference_dataframe.csv', index_col=None)
full_wheat_dict = SeqIO.to_dict(SeqIO.parse("../../analysis/Mapping/wheat/subsample_reads/wheat_community_200.fasta", "fasta"))
n_per_species = 200.


def _predict_child_taxon(tax_rank, node_name, X):
    """
    Predict the child taxon of `node_name` for the first read in `X`.

    Loads the class-index table and the saved model belonging to
    (`tax_rank`, `node_name`), runs a single forward pass and translates
    the winning class index into its taxon name. The Keras session is
    cleared afterwards, even on failure, so models do not pile up in memory.
    """
    classes = pd.read_csv('../../analysis/models/keys_%s_%s_15000.csv' % (tax_rank, node_name), header=None)
    classes.columns = ['predict','pred_name']
    model = load_model('../../analysis/models/model_%s_%s_15000.h5' % (tax_rank, node_name))
    try:
        scores = model.predict(np.expand_dims(X, 2))
    finally:
        keras.backend.clear_session()
    # Same decision rule as the removed Sequential.predict_classes():
    # argmax for a multi-unit output, 0.5 threshold for a single unit.
    if scores.shape[-1] > 1:
        predicts = scores.argmax(axis=-1)
    else:
        predicts = (scores > 0.5).astype('int32')
    predicted_index = int(np.ravel(predicts)[0])
    return classes[classes['predict'] == predicted_index]['pred_name'].to_list()[0]


# Distinct species in the read set, in order of first appearance. The
# species name is the second space-separated token of the FASTA description.
species_list = list(dict.fromkeys(
    record.description.split(' ')[1] for record in full_wheat_dict.values()
))

# species -> (taxonomy dict from the Qiime file, per-rank score dict)
all_values_dict = {}
for species in species_list:
    taxfileid = getquery_taxfileid(ref_df, species)
    query_tax_dict = get_taxid_dict(wheat_taxonomy_file_fn, taxfileid)
    all_values_dict[species] = query_tax_dict,{'k': 0, 'p': 0, 'c': 0, 'o': 0, 'f': 0, 'g': 0, 's': 0}

reads_per_species = int(n_per_species)
total_reads = len(full_wheat_dict)

for i, key in enumerate(full_wheat_dict):
    print('%s/%s' % (i, total_reads))
    record = full_wheat_dict[key]
    species_name = record.description.split(' ')[1]
    # Assumes the FASTA holds the reads of each species as one consecutive
    # run of n_per_species records - TODO confirm for new input files.
    if i % reads_per_species == 0:
        print(species_name)
    max_len = len(record.seq)

    # convert base pair coding to numerical coding and pad it;
    # one read at a time, so X_test has a single row
    n_reads = 1
    X_test = np.array(list(numberfy(record, max_len, n_reads, species_name).values()))

    species_scores = all_values_dict[species_name][1]
    # Only the fungal model tree is used, so the kingdom is taken as correct.
    species_scores['k'] = 1.

    # Row of ref_df describing the true lineage (column 1 holds the species).
    truth_row = ref_df[ref_df.iloc[:,1] == species_name]

    # The walk starts at the fungi root. `counter` is the ref_df column
    # holding the true taxon one rank below `tax_rank`: it runs from column
    # 6 down to column 1 (the species) as the ranks descend.
    predicted_class = 'fungi'
    for counter, tax_rank in zip(range(6, 0, -1), tax_ranks):
        if tax_rank == 'kingdom' or predicted_class in nodes['tax_name'].values:
            predicted_class = _predict_child_taxon(tax_rank, predicted_class, X_test)
        else:
            # No model for this node: read its child from the large
            # reference table instead.
            predicted_class = large_ref_df[large_ref_df[tax_rank] == predicted_class].iloc[:,counter].to_list()[0]
        print('Predicted class is', predicted_class)

        expected = truth_row.iloc[:,counter].to_list()[0]
        print(expected)
        if tax_rank == 'genus':
            # The species column stores '<genus>_<epithet>'; the genus-level
            # prediction is compared against the epithet only.
            expected = expected.split('_')[1]
        if expected == predicted_class:
            # Score keys are the first letter of the ref_df column name.
            species_scores[truth_row.columns[counter][0]] += 1./n_per_species

    # After the last read of a species, persist that species' scores.
    if (i+1) % reads_per_species == 0:
        print(species_scores)
        with open('/media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/ML_results/%s.json' % species_name, 'w+') as fp:
            json.dump(species_scores, fp)

0/600
puccinia_striiformis
Predicted class is ascomycota
basidiomycota
Predicted class is dothideomycetes
pucciniomycetes
Predicted class is capnodiales
pucciniales
Predicted class is mycosphaerellaceae
pucciniaceae
Predicted class is zymoseptoria
puccinia
Predicted class is tritici
puccinia_striiformis
1/600
Predicted class is ascomycota
basidiomycota
Predicted class is eurotiomycetes
pucciniomycetes
Predicted class is eurotiales
pucciniales
Predicted class is aspergillaceae
pucciniaceae
Predicted class is penicillium
puccinia
Predicted class is chrysogenum
puccinia_striiformis
2/600
Predicted class is ascomycota
basidiomycota
Predicted class is eurotiomycetes
pucciniomycetes
Predicted class is eurotiales
pucciniales
Predicted class is aspergillaceae
pucciniaceae
Predicted class is aspergillus
puccinia
Predicted class is niger
puccinia_striiformis
3/600
Predicted class is ascomycota
basidiomycota
Predicted class is eurotiomycetes
pucciniomycetes
Predicted class is eurotiales
puccinial

Predicted class is aspergillus
puccinia
Predicted class is unidentified
puccinia_striiformis
29/600
Predicted class is ascomycota
basidiomycota
Predicted class is eurotiomycetes
pucciniomycetes
Predicted class is chaetothyriales
pucciniales
Predicted class is herpotrichiellaceae
pucciniaceae
Predicted class is cladophialophora
puccinia
Predicted class is unidentified
puccinia_striiformis
30/600
Predicted class is ascomycota
basidiomycota
Predicted class is dothideomycetes
pucciniomycetes
Predicted class is capnodiales
pucciniales
Predicted class is mycosphaerellaceae
pucciniaceae
Predicted class is zymoseptoria
puccinia
Predicted class is tritici
puccinia_striiformis
31/600
Predicted class is ascomycota
basidiomycota
Predicted class is eurotiomycetes
pucciniomycetes
Predicted class is chaetothyriales
pucciniales
Predicted class is herpotrichiellaceae
pucciniaceae
Predicted class is cladophialophora
puccinia
Predicted class is unidentified
puccinia_striiformis
32/600
Predicted class is 

Predicted class is ascomycota
basidiomycota
Predicted class is eurotiomycetes
pucciniomycetes
Predicted class is eurotiales
pucciniales
Predicted class is aspergillaceae
pucciniaceae
Predicted class is aspergillus
puccinia
Predicted class is unidentified
puccinia_striiformis
59/600
Predicted class is ascomycota
basidiomycota
Predicted class is dothideomycetes
pucciniomycetes
Predicted class is capnodiales
pucciniales
Predicted class is mycosphaerellaceae
pucciniaceae
Predicted class is zymoseptoria
puccinia
Predicted class is tritici
puccinia_striiformis
60/600
Predicted class is ascomycota
basidiomycota
Predicted class is eurotiomycetes
pucciniomycetes
Predicted class is eurotiales
pucciniales
Predicted class is aspergillaceae
pucciniaceae
Predicted class is penicillium
puccinia
Predicted class is chrysogenum
puccinia_striiformis
61/600
Predicted class is ascomycota
basidiomycota
Predicted class is dothideomycetes
pucciniomycetes
Predicted class is capnodiales
pucciniales
Predicted cl

Predicted class is ascomycota
basidiomycota
Predicted class is eurotiomycetes
pucciniomycetes
Predicted class is eurotiales
pucciniales
Predicted class is aspergillaceae
pucciniaceae
Predicted class is aspergillus
puccinia
Predicted class is unidentified
puccinia_striiformis
88/600
Predicted class is ascomycota
basidiomycota
Predicted class is eurotiomycetes
pucciniomycetes
Predicted class is eurotiales
pucciniales
Predicted class is aspergillaceae
pucciniaceae
Predicted class is aspergillus
puccinia
Predicted class is unidentified
puccinia_striiformis
89/600
Predicted class is ascomycota
basidiomycota
Predicted class is eurotiomycetes
pucciniomycetes
Predicted class is eurotiales
pucciniales
Predicted class is aspergillaceae
pucciniaceae
Predicted class is aspergillus
puccinia
Predicted class is unidentified
puccinia_striiformis
90/600
Predicted class is ascomycota
basidiomycota
Predicted class is eurotiomycetes
pucciniomycetes
Predicted class is eurotiales
pucciniales
Predicted class

Predicted class is basidiomycota
basidiomycota
Predicted class is exobasidiomycetes
pucciniomycetes
Predicted class is microstromatales
pucciniales
Predicted class is quambalariaceae
pucciniaceae
Predicted class is quambalaria
puccinia
Predicted class is cyanescens
puccinia_striiformis
117/600
Predicted class is basidiomycota
basidiomycota
Predicted class is agaricomycetes
pucciniomycetes
Predicted class is agaricales
pucciniales
Predicted class is cortinariaceae
pucciniaceae
Predicted class is cortinarius
puccinia
Predicted class is globuliformis
puccinia_striiformis
118/600
Predicted class is ascomycota
basidiomycota
Predicted class is eurotiomycetes
pucciniomycetes
Predicted class is eurotiales
pucciniales
Predicted class is aspergillaceae
pucciniaceae
Predicted class is penicillium
puccinia
Predicted class is chrysogenum
puccinia_striiformis
119/600
Predicted class is ascomycota
basidiomycota
Predicted class is eurotiomycetes
pucciniomycetes
Predicted class is eurotiales
pucciniale

Predicted class is ascomycota
basidiomycota
Predicted class is dothideomycetes
pucciniomycetes
Predicted class is capnodiales
pucciniales
Predicted class is mycosphaerellaceae
pucciniaceae
Predicted class is zymoseptoria
puccinia
Predicted class is tritici
puccinia_striiformis
146/600
Predicted class is ascomycota
basidiomycota
Predicted class is leotiomycetes
pucciniomycetes
Predicted class is helotiales
pucciniales
Predicted class is dermateaceae
pucciniaceae
Predicted class is oculimacula
puccinia
Predicted class is yallundae-ccl031
puccinia_striiformis
147/600
Predicted class is ascomycota
basidiomycota
Predicted class is dothideomycetes
pucciniomycetes
Predicted class is capnodiales
pucciniales
Predicted class is mycosphaerellaceae
pucciniaceae
Predicted class is zymoseptoria
puccinia
Predicted class is tritici
puccinia_striiformis
148/600
Predicted class is ascomycota
basidiomycota
Predicted class is eurotiomycetes
pucciniomycetes
Predicted class is chaetothyriales
pucciniales
Pr

Predicted class is unidentified
puccinia_striiformis
174/600
Predicted class is ascomycota
basidiomycota
Predicted class is eurotiomycetes
pucciniomycetes
Predicted class is chaetothyriales
pucciniales
Predicted class is herpotrichiellaceae
pucciniaceae
Predicted class is cladophialophora
puccinia
Predicted class is unidentified
puccinia_striiformis
175/600
Predicted class is ascomycota
basidiomycota
Predicted class is eurotiomycetes
pucciniomycetes
Predicted class is eurotiales
pucciniales
Predicted class is aspergillaceae
pucciniaceae
Predicted class is penicillium
puccinia
Predicted class is chrysogenum
puccinia_striiformis
176/600
Predicted class is ascomycota
basidiomycota
Predicted class is eurotiomycetes
pucciniomycetes
Predicted class is eurotiales
pucciniales
Predicted class is aspergillaceae
pucciniaceae
Predicted class is aspergillus
puccinia
Predicted class is niger
puccinia_striiformis
177/600
Predicted class is ascomycota
basidiomycota
Predicted class is eurotiomycetes
pu

Predicted class is eurotiales
capnodiales
Predicted class is aspergillaceae
mycosphaerellaceae
Predicted class is aspergillus
zymoseptoria
Predicted class is unidentified
zymoseptoria_tritici
203/600
Predicted class is ascomycota
ascomycota
Predicted class is eurotiomycetes
dothideomycetes
Predicted class is eurotiales
capnodiales
Predicted class is aspergillaceae
mycosphaerellaceae
Predicted class is aspergillus
zymoseptoria
Predicted class is unidentified
zymoseptoria_tritici
204/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is capnodiales
capnodiales
Predicted class is mycosphaerellaceae
mycosphaerellaceae
Predicted class is zymoseptoria
zymoseptoria
Predicted class is tritici
zymoseptoria_tritici
205/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is capnodiales
capnodiales
Predicted class is mycosphaerellaceae
mycosphaerellaceae
Predicted class is zymos

Predicted class is eurotiales
capnodiales
Predicted class is aspergillaceae
mycosphaerellaceae
Predicted class is aspergillus
zymoseptoria
Predicted class is unidentified
zymoseptoria_tritici
231/600
Predicted class is ascomycota
ascomycota
Predicted class is eurotiomycetes
dothideomycetes
Predicted class is eurotiales
capnodiales
Predicted class is aspergillaceae
mycosphaerellaceae
Predicted class is aspergillus
zymoseptoria
Predicted class is unidentified
zymoseptoria_tritici
232/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is capnodiales
capnodiales
Predicted class is mycosphaerellaceae
mycosphaerellaceae
Predicted class is zymoseptoria
zymoseptoria
Predicted class is tritici
zymoseptoria_tritici
233/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is capnodiales
capnodiales
Predicted class is mycosphaerellaceae
mycosphaerellaceae
Predicted class is zymos

Predicted class is ascomycota
ascomycota
Predicted class is saccharomycetes
dothideomycetes
Predicted class is saccharomycetales
capnodiales
Predicted class is metschnikowiaceae
mycosphaerellaceae
Predicted class is kodamaea
zymoseptoria
Predicted class is ohmeri
zymoseptoria_tritici
260/600
Predicted class is basidiomycota
ascomycota
Predicted class is agaricomycetes
dothideomycetes
Predicted class is agaricales
capnodiales
Predicted class is cortinariaceae
mycosphaerellaceae
Predicted class is cortinarius
zymoseptoria
Predicted class is globuliformis
zymoseptoria_tritici
261/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is capnodiales
capnodiales
Predicted class is mycosphaerellaceae
mycosphaerellaceae
Predicted class is zymoseptoria
zymoseptoria
Predicted class is tritici
zymoseptoria_tritici
262/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is capnodia

Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is capnodiales
capnodiales
Predicted class is mycosphaerellaceae
mycosphaerellaceae
Predicted class is zymoseptoria
zymoseptoria
Predicted class is tritici
zymoseptoria_tritici
288/600
Predicted class is ascomycota
ascomycota
Predicted class is eurotiomycetes
dothideomycetes
Predicted class is eurotiales
capnodiales
Predicted class is aspergillaceae
mycosphaerellaceae
Predicted class is aspergillus
zymoseptoria
Predicted class is unidentified
zymoseptoria_tritici
289/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is capnodiales
capnodiales
Predicted class is mycosphaerellaceae
mycosphaerellaceae
Predicted class is zymoseptoria
zymoseptoria
Predicted class is tritici
zymoseptoria_tritici
290/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is capnodiales


Predicted class is ascomycota
ascomycota
Predicted class is eurotiomycetes
dothideomycetes
Predicted class is eurotiales
capnodiales
Predicted class is aspergillaceae
mycosphaerellaceae
Predicted class is aspergillus
zymoseptoria
Predicted class is unidentified
zymoseptoria_tritici
316/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is capnodiales
capnodiales
Predicted class is mycosphaerellaceae
mycosphaerellaceae
Predicted class is zymoseptoria
zymoseptoria
Predicted class is tritici
zymoseptoria_tritici
317/600
Predicted class is ascomycota
ascomycota
Predicted class is eurotiomycetes
dothideomycetes
Predicted class is eurotiales
capnodiales
Predicted class is aspergillaceae
mycosphaerellaceae
Predicted class is aspergillus
zymoseptoria
Predicted class is unidentified
zymoseptoria_tritici
318/600
Predicted class is ascomycota
ascomycota
Predicted class is eurotiomycetes
dothideomycetes
Predicted class is eurotiales
capn

Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is capnodiales
capnodiales
Predicted class is mycosphaerellaceae
mycosphaerellaceae
Predicted class is zymoseptoria
zymoseptoria
Predicted class is tritici
zymoseptoria_tritici
344/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is capnodiales
capnodiales
Predicted class is mycosphaerellaceae
mycosphaerellaceae
Predicted class is zymoseptoria
zymoseptoria
Predicted class is tritici
zymoseptoria_tritici
345/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is capnodiales
capnodiales
Predicted class is mycosphaerellaceae
mycosphaerellaceae
Predicted class is zymoseptoria
zymoseptoria
Predicted class is tritici
zymoseptoria_tritici
346/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is capnodiale

Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is pleosporales
capnodiales
Predicted class is pleosporaceae
mycosphaerellaceae
Predicted class is pyrenophora
zymoseptoria
Predicted class is tritici-repentis
zymoseptoria_tritici
372/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is capnodiales
capnodiales
Predicted class is mycosphaerellaceae
mycosphaerellaceae
Predicted class is zymoseptoria
zymoseptoria
Predicted class is tritici
zymoseptoria_tritici
373/600
Predicted class is ascomycota
ascomycota
Predicted class is eurotiomycetes
dothideomycetes
Predicted class is eurotiales
capnodiales
Predicted class is aspergillaceae
mycosphaerellaceae
Predicted class is aspergillus
zymoseptoria
Predicted class is unidentified
zymoseptoria_tritici
374/600
Predicted class is ascomycota
ascomycota
Predicted class is eurotiomycetes
dothideomycetes
Predicted class is eurotiale

Predicted class is ascomycota
ascomycota
Predicted class is eurotiomycetes
dothideomycetes
Predicted class is eurotiales
capnodiales
Predicted class is aspergillaceae
mycosphaerellaceae
Predicted class is aspergillus
zymoseptoria
Predicted class is unidentified
zymoseptoria_tritici
{'k': 1.0, 'p': 0.9750000000000008, 'c': 0.5700000000000004, 'o': 0.5400000000000004, 'f': 0.5400000000000004, 'g': 0.5400000000000004, 's': 0.5400000000000004}
400/600
pyrenophora_tritici-repentis
Predicted class is ascomycota
ascomycota
Predicted class is eurotiomycetes
dothideomycetes
Predicted class is eurotiales
pleosporales
Predicted class is aspergillaceae
pleosporaceae
Predicted class is aspergillus
pyrenophora
Predicted class is niger
pyrenophora_tritici-repentis
401/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is pleosporales
pleosporales
Predicted class is pleosporaceae
pleosporaceae
Predicted class is pyrenophora
pyrenophora
Predi

Predicted class is eurotiomycetes
dothideomycetes
Predicted class is eurotiales
pleosporales
Predicted class is aspergillaceae
pleosporaceae
Predicted class is aspergillus
pyrenophora
Predicted class is niger
pyrenophora_tritici-repentis
427/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is pleosporales
pleosporales
Predicted class is pleosporaceae
pleosporaceae
Predicted class is pyrenophora
pyrenophora
Predicted class is tritici-repentis
pyrenophora_tritici-repentis
428/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is pleosporales
pleosporales
Predicted class is pleosporaceae
pleosporaceae
Predicted class is pyrenophora
pyrenophora
Predicted class is tritici-repentis
pyrenophora_tritici-repentis
429/600
Predicted class is basidiomycota
ascomycota
Predicted class is agaricomycetes
dothideomycetes
Predicted class is agaricales
pleosporales
Predicted class i

Predicted class is dothideomycetes
dothideomycetes
Predicted class is pleosporales
pleosporales
Predicted class is pleosporaceae
pleosporaceae
Predicted class is pyrenophora
pyrenophora
Predicted class is tritici-repentis
pyrenophora_tritici-repentis
455/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is pleosporales
pleosporales
Predicted class is pleosporaceae
pleosporaceae
Predicted class is pyrenophora
pyrenophora
Predicted class is tritici-repentis
pyrenophora_tritici-repentis
456/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is capnodiales
pleosporales
Predicted class is mycosphaerellaceae
pleosporaceae
Predicted class is zymoseptoria
pyrenophora
Predicted class is tritici
pyrenophora_tritici-repentis
457/600
Predicted class is ascomycota
ascomycota
Predicted class is eurotiomycetes
dothideomycetes
Predicted class is chaetothyriales
pleosporales
Predic

Predicted class is ascomycota
ascomycota
Predicted class is eurotiomycetes
dothideomycetes
Predicted class is eurotiales
pleosporales
Predicted class is aspergillaceae
pleosporaceae
Predicted class is aspergillus
pyrenophora
Predicted class is niger
pyrenophora_tritici-repentis
483/600
Predicted class is ascomycota
ascomycota
Predicted class is eurotiomycetes
dothideomycetes
Predicted class is eurotiales
pleosporales
Predicted class is aspergillaceae
pleosporaceae
Predicted class is penicillium
pyrenophora
Predicted class is chrysogenum
pyrenophora_tritici-repentis
484/600
Predicted class is ascomycota
ascomycota
Predicted class is eurotiomycetes
dothideomycetes
Predicted class is eurotiales
pleosporales
Predicted class is aspergillaceae
pleosporaceae
Predicted class is aspergillus
pyrenophora
Predicted class is niger
pyrenophora_tritici-repentis
485/600
Predicted class is ascomycota
ascomycota
Predicted class is eurotiomycetes
dothideomycetes
Predicted class is eurotiales
pleosporales

Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is pleosporales
pleosporales
Predicted class is pleosporaceae
pleosporaceae
Predicted class is pyrenophora
pyrenophora
Predicted class is tritici-repentis
pyrenophora_tritici-repentis
511/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is pleosporales
pleosporales
Predicted class is pleosporaceae
pleosporaceae
Predicted class is pyrenophora
pyrenophora
Predicted class is tritici-repentis
pyrenophora_tritici-repentis
512/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is botryosphaeriales
pleosporales
Predicted class is botryosphaeriaceae
pleosporaceae
Predicted class is dothiorella
pyrenophora
Predicted class is vidmadera
pyrenophora_tritici-repentis
513/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predi

Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is capnodiales
pleosporales
Predicted class is mycosphaerellaceae
pleosporaceae
Predicted class is zymoseptoria
pyrenophora
Predicted class is tritici
pyrenophora_tritici-repentis
539/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is pleosporales
pleosporales
Predicted class is pleosporaceae
pleosporaceae
Predicted class is pyrenophora
pyrenophora
Predicted class is tritici-repentis
pyrenophora_tritici-repentis
540/600
Predicted class is ascomycota
ascomycota
Predicted class is eurotiomycetes
dothideomycetes
Predicted class is chaetothyriales
pleosporales
Predicted class is herpotrichiellaceae
pleosporaceae
Predicted class is cladophialophora
pyrenophora
Predicted class is unidentified
pyrenophora_tritici-repentis
541/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Pre

Predicted class is ascomycota
ascomycota
Predicted class is eurotiomycetes
dothideomycetes
Predicted class is eurotiales
pleosporales
Predicted class is aspergillaceae
pleosporaceae
Predicted class is aspergillus
pyrenophora
Predicted class is unidentified
pyrenophora_tritici-repentis
567/600
Predicted class is ascomycota
ascomycota
Predicted class is eurotiomycetes
dothideomycetes
Predicted class is eurotiales
pleosporales
Predicted class is aspergillaceae
pleosporaceae
Predicted class is aspergillus
pyrenophora
Predicted class is niger
pyrenophora_tritici-repentis
568/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is pleosporales
pleosporales
Predicted class is pleosporaceae
pleosporaceae
Predicted class is pyrenophora
pyrenophora
Predicted class is tritici-repentis
pyrenophora_tritici-repentis
569/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is pleospor

Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is pleosporales
pleosporales
Predicted class is pleosporaceae
pleosporaceae
Predicted class is pyrenophora
pyrenophora
Predicted class is tritici-repentis
pyrenophora_tritici-repentis
595/600
Predicted class is ascomycota
ascomycota
Predicted class is saccharomycetes
dothideomycetes
Predicted class is saccharomycetales
pleosporales
Predicted class is debaryomycetaceae
pleosporaceae
Predicted class is meyerozyma
pyrenophora
Predicted class is guilliermondii
pyrenophora_tritici-repentis
596/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Predicted class is pleosporales
pleosporales
Predicted class is pleosporaceae
pleosporaceae
Predicted class is pyrenophora
pyrenophora
Predicted class is tritici-repentis
pyrenophora_tritici-repentis
597/600
Predicted class is ascomycota
ascomycota
Predicted class is dothideomycetes
dothideomycetes
Pr