### Application of Machine Learning to Synthetic Mock Community (Validation Dataset)

In [None]:
## REQUIRED PACKAGES
# argparse, Biopython, json, keras, math, matplotlib, numpy, os, pandas, random, scikit-learn

import pandas as pd
from Bio import SeqIO
import numpy as np
import json
import os
import random
import argparse
import keras
from keras.models import Sequential, load_model
from keras.layers import Dense, Conv1D, Dropout, MaxPooling1D, Flatten
from keras.utils.vis_utils import plot_model

import math
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score,precision_score,recall_score,f1_score

# Extract the reference for the taxa
def getquery_taxfileid(ref_df, species):
    """
    Look up the taxonomy-file identifier recorded for a species.

    ref_df  -- reference dataframe (not a filename) with a ``species`` column.
    species -- species name to search for in that column.

    Returns the column-0 value of the first row whose ``species`` equals
    `species`. Raises IndexError when no row matches.
    """
    matching_rows = ref_df[ref_df.species == species]
    first_column = matching_rows.iloc[:, 0]
    return first_column.values[0]

# Returns the taxonomic assignment at each rank as the expected output for comparison to the decision tree's predicted output
def get_taxid_dict(taxid_fn, taxfileid):
    """
    Read the per-rank taxonomy of one taxon from a Qiime-format taxonomy file.

    Each useful line has the form ``<id><TAB>k__Fungi;p__Ascomycota;...``.
    Every line whose text starts with `taxfileid` contributes its ranks; if
    several lines match, later lines overwrite earlier ones rank by rank.

    taxid_fn  -- path of the taxonomy assignment file.
    taxfileid -- identifier prefix selecting the line(s) to read.

    Returns a dictionary mapping the rank prefix (text before the first
    ``__``) to the taxon name (text after it). The dictionary is empty when
    no line matches. Raises OSError if the file cannot be opened.
    """
    tax_dict = {}
    with open(taxid_fn, 'r') as fh:
        for line in fh:
            # NOTE(review): this is a prefix match, so "id1" also selects
            # "id10" - kept as-is because callers may pass a partial id.
            if not line.startswith(taxfileid):
                continue
            fields = line.rstrip().split('\t')
            if len(fields) < 2:
                # Matching id but no taxonomy column: nothing to record.
                continue
            for taxrank in fields[1].split(';'):
                # strip() tolerates "; " separators; partition() keeps names
                # that themselves contain "__" intact.
                rank, separator, name = taxrank.strip().partition('__')
                if not separator:
                    # Empty token (trailing ';') or a label without a rank
                    # prefix - skip it instead of raising IndexError.
                    continue
                tax_dict[rank] = name
    return tax_dict

# Convert nucleic data to numerical data and pad sequence to pad_length for use with the validation dataset
def numberfy(SeqIO_dict, seq_len, nsubsample, species_name, pad_length):
    """
    Encode one sequence record as a list of integers, padded with 4s.

    Bases are mapped case-insensitively: A->0, C->1, G->2, T->3, and the gap
    character '-' -> 4 (the same value used for padding).

    SeqIO_dict   -- a single record exposing ``.id`` and ``.seq`` (despite
                    the name, not a dictionary of records).
    seq_len      -- unused; kept so existing callers keep working.
    nsubsample   -- unused; kept so existing callers keep working.
    species_name -- unused; kept so existing callers keep working.
    pad_length   -- target length; shorter sequences are right-padded with 4.
                    Longer sequences are returned at their full length
                    (they are not truncated).

    Returns ``{record id: [int, ...]}``.
    Raises ValueError if the sequence contains any other character.
    """
    base_codes = {'A': 0, 'C': 1, 'G': 2, 'T': 3, '-': 4}

    encoded = []
    for position, base in enumerate(str(SeqIO_dict.seq)):
        code = base_codes.get(base.upper())
        if code is None:
            # Fail loudly: an unmapped character would otherwise corrupt
            # the numeric encoding of its neighbours.
            raise ValueError(
                "ERROR - strange value in sequence: %r at position %d of %s"
                % (base, position, SeqIO_dict.id)
            )
        encoded.append(code)

    # A non-positive repeat count yields an empty list, so long reads are
    # left unpadded.
    encoded.extend([4] * (pad_length - len(encoded)))
    return {SeqIO_dict.id: encoded}



## Input files and run-wide settings

# Qiime-format taxonomy assignments for the taxa of the mock community
mock_taxonomy_file_fn = os.path.abspath(
    '/media/MassStorage/tmp/TE/honours/analysis/Stats/mock_taxonomy_file_qiime.csv'
)

# Ranks visited for every read, in this order
tax_ranks = ['kingdom', 'phylum', 'class', 'order', 'family', 'genus']

# Nodes for which a model exists: space-separated file without a header row
nodes = pd.read_csv('../../analysis/Stats/nodes.csv', sep=' ', header=None)
nodes.columns = ['tax_rank', 'tax_name']

# Reference tables holding the taxonomic information
ref_df = pd.read_csv(
    '../../analysis/Stats/mock_reference_dataframe.csv',
    index_col=None,
)
large_ref_df = pd.read_csv(
    '../../analysis/Stats/large_mock_reference_dataframe.csv',
    index_col=None,
)

# Subsampled mock-community reads
full_mock_dict = SeqIO.to_dict(
    SeqIO.parse(
        "../../analysis/Mapping/mock/subsample_reads/mock_community_1000.fasta",
        "fasta",
    )
)

# Reads per species (a float, used as the denominator of the per-rank
# proportions) and the length every encoded read is padded to
n_per_species = 1000.
pad_length = 5000

# Distinct species labels (second space-separated field of each FASTA
# description), kept in first-seen order
species_list = list(dict.fromkeys(
    record.description.split(' ')[1] for record in full_mock_dict.values()
))

# species -> (taxonomy per rank read from the taxonomy file,
#             per-rank counters that start at zero)
all_values_dict = {}
for species in species_list:
    taxfileid = getquery_taxfileid(ref_df, species)
    query_tax_dict = get_taxid_dict(mock_taxonomy_file_fn, taxfileid)
    rank_counters = {rank: 0 for rank in ('k', 'p', 'c', 'o', 'f', 'g', 's')}
    all_values_dict[species] = (query_tax_dict, rank_counters)
        
    
# For each input read:
#    1. Convert the bases to numbers and pad to pad_length.
#    2. Walk the taxonomic ranks in tax_ranks order. At 'kingdom' the 'fungi'
#       model is always used. At later ranks, if the class predicted so far
#       is a node (has its own model) that model is loaded and run; otherwise
#       the next class is looked up in large_ref_df.
#    3. Compare the predicted class with the expected one from ref_df. On a
#       match, add 1/n_per_species to that rank's counter for the species, so
#       each counter ends up as the proportion of correctly classified reads.
# Once all reads of a species have been assessed, its counters are saved as a
# json file. This relies on the reads being grouped by species with exactly
# n_per_species reads each.


def _predict_class_name(tax_rank, node_name, model_input):
    """
    Run the model stored for (tax_rank, node_name) on one encoded read and
    return the name of the predicted class.

    The model is loaded from disk on every call; the caller clears the Keras
    session afterwards, so models are deliberately not cached here.
    """
    classes = pd.read_csv('../../analysis/models/keys_%s_%s_15000.csv' % (tax_rank, node_name), header=None)
    classes.columns = ['predict', 'pred_name']
    model = load_model('../../analysis/models/model_%s_%s_15000.h5' % (tax_rank, node_name))
    # One forward pass only. This is what Sequential.predict_classes did
    # internally, but that method no longer exists in recent Keras versions.
    scores = model.predict(model_input)
    if scores.shape[-1] > 1:
        class_index = scores.argmax(axis=-1)[0]
    else:
        # Single-output (binary) model: threshold at 0.5
        class_index = int(scores[0][0] > 0.5)
    return classes[classes['predict'] == class_index]['pred_name'].to_list()[0]


node_names = nodes['tax_name'].values
total_reads = len(full_mock_dict)

for i, record in enumerate(full_mock_dict.values()):
    # Print progress markers for clarity of display
    print('%s/%s' % (i + 1, total_reads))
    species_name = record.description.split(' ')[1]
    if i % n_per_species == 0:
        print(species_name)

    # Encode the read; the result is a (1, length) array, expanded to
    # (1, length, 1) before it is passed to the model
    numeric_read = numberfy(record, len(record.seq), 1, species_name, pad_length)
    X_valid = np.array(list(numeric_read.values()))
    model_input = np.expand_dims(X_valid, 2)

    rank_counts = all_values_dict[species_name][1]
    # As all taxa belong to the fungal kingdom, the kingdom counter is
    # incremented unconditionally
    rank_counts['k'] += 1. / n_per_species

    # Reference row(s) of this species; column 1 of ref_df holds the species
    species_ref = ref_df[ref_df.iloc[:, 1] == species_name]

    # Column of ref_df / large_ref_df compared at the current rank; it moves
    # from column 6 down to column 1 as the ranks get more specific.
    # NOTE(review): assumes both tables share this column layout - confirm.
    counter = 6
    for tax_rank in tax_ranks:
        if tax_rank == 'kingdom':
            predicted_class = _predict_class_name(tax_rank, 'fungi', model_input)
        elif predicted_class in node_names:
            predicted_class = _predict_class_name(tax_rank, predicted_class, model_input)
        else:
            # Not a node: take the next class from the large reference table
            predicted_class = large_ref_df[large_ref_df[tax_rank] == predicted_class].iloc[:, counter].to_list()[0]

        expected_class = species_ref.iloc[:, counter].to_list()[0]
        if tax_rank == 'genus':
            # At this rank only the part after the first underscore of the
            # reference value is compared
            expected_class = expected_class.split('_')[1]

        # The counter key is the first letter of the compared column's name
        if expected_class == predicted_class:
            rank_counts[species_ref.columns[counter][0]] += 1. / n_per_species
        counter -= 1
        keras.backend.clear_session()

    # If all reads for one taxon have been assessed, print the accuracy of
    # the predictions at each taxonomic rank and save it as a json file
    if (i + 1) % n_per_species == 0:
        print(rank_counts)
        with open('/media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/ML_results/%s.json' % species_name, 'w+') as fp:
            json.dump(rank_counts, fp)