## Apply to wheat species for qiime and consensus respectively

In [1]:
from Bio import SeqIO
import os
import random
import subprocess
import pandas as pd

# Root of the project data; the relative 'path for use' entries of the reference
# dataframe are joined onto this directory further down.
INPUT_BASEDIR = os.path.abspath('/media/MassStorage/tmp/TE/honours')
# Directory that receives the subsampled read files for the wheat run.
subsampling_dir = os.path.abspath('/media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/subsample_reads')
# Directory that receives the minimap2 PAF files of the qiime-database mapping.
mapping_dir = os.path.abspath('/media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/qiime_results')

# Wheat reference table, custom database and taxonomy file (qiime format).
wheat_reference_dataframe_fn = os.path.abspath('/media/MassStorage/tmp/TE/honours/analysis/Stats/wheat_reference_dataframe.csv')
wheat_max_custom_database_fn = os.path.abspath('/media/MassStorage/tmp/TE/honours/database/wheat_database_labelled.fasta')
wheat_taxonomy_file_fn = os.path.abspath('/media/MassStorage/tmp/TE/honours/analysis/Stats/wheat_taxonomy_file_qiime.csv')

# NOTE: unlike the paths above, these two are relative, so os.path.abspath resolves
# them against the working directory the notebook is started from.
qiime_db_fn = os.path.abspath('../../analysis/qiime2/db/sh_refs_qiime_ver8_dynamic_02.02.2019.fasta')
qiime_tax_fn = os.path.abspath('../../analysis/qiime2/db/sh_taxonomy_qiime_ver8_dynamic_02.02.2019.txt')
# Thread count handed to minimap2 through its -t option.
threads = 10

# Load the reference table; the bare name on the last line displays it in the notebook.
wheat_ref_df = pd.read_csv(wheat_reference_dataframe_fn)
wheat_ref_df

Unnamed: 0.1,Unnamed: 0,species,genus,family,order,class,phylum,kingdom,# raw reads,# reads after homology filtering,# reads after length filtering,# for use,path to raw reads,path to homology filtering,path to length filtering,path for use
0,barcode01,puccinia_striiformis,puccinia,pucciniaceae,pucciniales,pucciniomycetes,basidiomycota,fungi,115052,23477,21590,21590.0,analysis/Concatenated/wheat/barcode01/merged.f...,analysis/Python_Processing/wheat/barcode01/com...,analysis/Length_Filtered/wheat/barcode01/lengt...,analysis/Length_Filtered/wheat/barcode01/lengt...
1,barcode02,zymoseptoria_tritici,zymoseptoria,mycosphaerellaceae,capnodiales,dothideomycetes,ascomycota,fungi,151005,20416,18100,18100.0,analysis/Concatenated/wheat/barcode02/merged.f...,analysis/Python_Processing/wheat/barcode02/com...,analysis/Length_Filtered/wheat/barcode02/lengt...,analysis/Length_Filtered/wheat/barcode02/lengt...
2,barcode05,pyrenophora_tritici-repentis,pyrenophora,pleosporaceae,pleosporales,dothideomycetes,ascomycota,fungi,134094,24785,22433,22433.0,analysis/Concatenated/wheat/barcode05/merged.f...,analysis/Python_Processing/wheat/barcode05/com...,analysis/Length_Filtered/wheat/barcode05/lengt...,analysis/Length_Filtered/wheat/barcode05/lengt...


##### Qiime Database

In [2]:
import json
from collections import OrderedDict

def assign_taxranks_results(mapping_df, tax_fn, ref_df_fn = False):
    """
    Fill in the taxonomic rank columns of a mapping results dataframe.

    mapping_df is indexed by hit (target) name, tax_fn is a taxonomy assignment
    file and ref_df_fn is an optional reference dataframe filename. When
    ref_df_fn is given, each hit name is first translated into a taxonomy file
    identifier through the reference dataframe; otherwise the hit name itself
    is used as the identifier.
    The dataframe is updated in place and also returned.
    """
    for target_name in mapping_df.index:
        lookup_id = getquery_taxfileid(ref_df_fn, target_name) if ref_df_fn else target_name
        rank_assignments = get_taxid_dict(tax_fn, lookup_id)
        # One cell per rank returned for this hit.
        for rank in rank_assignments:
            mapping_df.loc[target_name, rank] = rank_assignments[rank]
    return mapping_df

def get_accuracy_dict(mapping_df, query_tax_dict):
    """
    Summarise the mapping accuracy of the mapping results at all taxonomic ranks.

    mapping_df: dataframe with a 'count' column and one column per taxonomic
        rank ['k', 'p', 'c', 'o', 'f', 'g', 's'] holding the assignment of each hit.
    query_tax_dict: dictionary with the known assignment of the query for the
        same rank keys (a KeyError is raised if a rank is missing).

    Returns an OrderedDict mapping each rank to the fraction of the total hit
    count whose assignment equals the one of the query.

    At species rank a hit labelled 'unidentified' is only counted when its
    genus also matches the query genus. This only has an effect when the query
    species itself is 'unidentified'.

    The input dataframe is left untouched; excluded rows are masked, not dropped.
    """
    accuracy_dict = OrderedDict()
    total_count = mapping_df['count'].sum()
    for tax_rank in ['k', 'p', 'c', 'o', 'f', 'g', 's']:
        rank_matches = mapping_df[tax_rank] == query_tax_dict[tax_rank]
        if tax_rank == 's':
            # 'unidentified' species from a different genus are not a true match.
            wrong_genus_unidentified = (
                (mapping_df['s'] == 'unidentified')
                & (mapping_df['g'] != query_tax_dict['g'])
            )
            rank_matches = rank_matches & ~wrong_genus_unidentified
        hit_count = mapping_df.loc[rank_matches, 'count'].sum()
        accuracy_dict[tax_rank] = hit_count/total_count
    return accuracy_dict

def getquery_taxfileid(refdf_fn, species):
    """
    Look up the taxonomy file identifier of a species.

    Reads the reference dataframe from refdf_fn, keeps the rows whose 'species'
    column equals the given species and returns the value found in the first
    column (column 0) of the first such row.
    Raises an IndexError when no row matches.
    """
    reference_table = pd.read_csv(refdf_fn)
    species_rows = reference_table[reference_table['species'] == species]
    first_column = species_rows.iloc[:, 0]
    return first_column.values[0]

def get_taxid_dict(taxid_fn, taxfileid):
    """
    Takes a taxonomy assignment file filename in the Qiime format and a taxonomic identifier.
    Returns a dictionary with the taxonomic assignment at each rank.

    Each line of the file is expected to look like
    '<identifier>\\t<rank>__<name>;<rank>__<name>;...'.

    A line whose identifier column equals taxfileid exactly takes precedence and
    ends the search, so an identifier such as 'barcode01' can no longer pick up
    the assignment of 'barcode010'. If no line matches exactly, every line that
    merely starts with taxfileid is used, with later lines overriding earlier
    ones (the previous behaviour). An empty dictionary is returned when nothing
    matches at all.
    """
    prefix_matches = {}
    with open(taxid_fn, 'r') as fh:
        for line in fh:
            if not line.startswith(taxfileid):
                continue
            fields = line.rstrip().split('\t')
            ranks = {}
            for taxrank in fields[1].split(';'):
                parts = taxrank.split('__')
                ranks[parts[0]] = parts[1]
            if fields[0] == taxfileid:
                # Exact hit on the identifier column: no need to read further.
                return ranks
            prefix_matches.update(ranks)
    return prefix_matches

def mapping_results(fn, species):
    """
    Print and store the per-target hit fractions of a minimap2 PAF file.

    Reads the first 12 PAF columns of fn, scores every alignment with
    cscore = alen / (alen - nmatch) and keeps, per read, the alignment(s) with
    the highest cscore. The share of kept alignments per target name is
    printed and written as JSON to the custom_results directory under the
    name '<species>.json'.
    """
    paf_columns = ['qseqid', 'qlen', 'qstart', 'qstop', 'strand', 'tname', 'tlen', 'tstart', 'tend', 'nmatch', 'alen', 'mquality']
    paf_df = pd.read_csv(fn, sep='\t', header=None, usecols=list(range(12)), names=paf_columns)
    paf_df['cscore'] = paf_df['alen'] / (paf_df['alen'] - paf_df['nmatch'])

    # Keep only the best scoring alignment(s) of every read.
    best_per_read = paf_df.groupby('qseqid')['cscore'].transform(max)
    best_df = paf_df[paf_df['cscore'] == best_per_read].reset_index(drop=True)

    # Fraction of the kept alignments that landed on each target.
    per_target = best_df.groupby('tname')['mquality'].count()
    hit_series = pd.Series(per_target.tolist() / per_target.sum(), per_target.index)
    hit_series.sort_values(ascending=False, inplace=True)

    # Sanity check: filtering must not lose any read.
    print(best_df.qseqid.unique().shape == paf_df.qseqid.unique().shape)
    print('##########\n')
    print(F"This was the query species: {species}\n")
    print(F"These are the results:")
    print(hit_series,'\n')
    hit_series.to_json('/media/MassStorage/tmp/TE/honours/analysis/Mapping/custom_results/%s.json' % species)

def minimapmapping(fasta_fn, ref_fn, out_fn, threads):
    """
    Map the reads in fasta_fn against ref_fn with minimap2 (preset map-ont)
    using the given number of threads and write the result to out_fn.
    The exit status and output of the tool are not inspected.
    """
    arguments = ['minimap2', '-x', 'map-ont', '-t', F"{threads}", F"{ref_fn}", F"{fasta_fn}", '-o', F"{out_fn}"]
    subprocess.getstatusoutput(' '.join(arguments))

def pull_mapping_results_v3(fn):
    """
    Takes a minimap2 paf and reads it in with the first 12 columns. Ignores the rest.

    Every alignment gets a score cscore = alen / (alen - nmatch). For each read
    (qseqid) only the alignment(s) with the highest cscore are kept; ties are
    all kept. An alignment with alen == nmatch divides by zero, which pandas
    evaluates to inf, so it ranks as the best hit of its read.

    Returns a tuple (hit_df, tmp_df):
      hit_df has the tnames as index and the number of kept alignments per
        tname in column 'count', sorted by descending count.
      tmp_df is the unfiltered alignment table including the 'cscore' column.
    Both dataframes also get the taxrank columns
    ['k', 'p', 'c', 'o', 'f', 'g', 's'], all False to start with.
    """
    min_header = ['qseqid', 'qlen', 'qstart', 'qstop', 'strand', 'tname', 'tlen', 'tstart', 'tend', 'nmatch', 'alen', 'mquality']
    tmp_df = pd.read_csv(fn, sep='\t', header=None, usecols=list(range(12)), names=min_header)
    tmp_df['cscore'] = tmp_df['alen']/(tmp_df['alen']-tmp_df['nmatch'])
    # The string name selects the built-in groupby reduction; passing the Python
    # builtin max is deprecated in recent pandas releases.
    best_cscore = tmp_df.groupby('qseqid')['cscore'].transform('max')
    sub_df = tmp_df[tmp_df['cscore'] == best_cscore].reset_index(drop=True)
    hit_df = sub_df.groupby('tname')['cscore'].count().to_frame('count')
    hit_df.sort_values(by='count', ascending=False, inplace=True)
    for key in ['k', 'p', 'c', 'o', 'f', 'g', 's']:
        hit_df[key] = False
        tmp_df[key] = False
    return hit_df, tmp_df
    
def subsamplereads(in_fn, out_fn, n_reads):
    """
    Call reformat.sh to subsample in_fn down to a target of n_reads reads and
    write them to out_fn. The exit status and output of the tool are not inspected.
    """
    options = F'samplereadstarget={n_reads} in={in_fn} out={out_fn}'
    subprocess.getstatusoutput('reformat.sh ' + options)

# Every species of the wheat reference table is tested under its full name;
# trailing strain suffixes ('-ccl031' / '-ccl029') are not stripped.
test_species_list = wheat_ref_df.species.tolist()
    
# For every test species: subsample its reads, map them against the qiime
# database with minimap2, assign taxonomy to the hits and store the per-rank
# accuracy as JSON.
for test_species in test_species_list:

    print(test_species)

    # Locate the read file of the test species.
    fn_subsampling = {}
    # Wrapped in a one-element list so the loops below can stay dictionary based.
    test_species = [test_species]
    for x in test_species:
#         print((wheat_ref_df['species'] == x))
        # Row is selected by species name and by genus (the part of the name before the first '_').
        fn_subsampling[x] = (wheat_ref_df[(wheat_ref_df['species'] == x) & (wheat_ref_df['genus'] == x.split('_')[0])]['path for use'].tolist()[0])
        # 'path for use' is relative to the project root.
        fn_subsampling[x] = os.path.join(INPUT_BASEDIR, fn_subsampling[x])

    # Subsample n_reads reads per species into subsampling_dir.
    sub_reads_fn = {}
    n_reads = 200
    for key, value in fn_subsampling.items():
        species = key
        in_fn = value
        out_fn = os.path.join(subsampling_dir, F'{species}.{n_reads}.fasta')
        subsamplereads(in_fn, out_fn, n_reads)
        sub_reads_fn[species] = out_fn

    # Map the subsampled reads against the qiime database; one PAF file per species.
    db_fn = qiime_db_fn
    sub_db_mapping_fn = {}
    for species, fasta_fn in sub_reads_fn.items():
        db_name = os.path.basename(db_fn).replace('.fasta', '')
        out_fn = os.path.join(mapping_dir, F"{db_name}.{species}.minimap2.paf")
        sub_db_mapping_fn[species] = out_fn
        minimapmapping(fasta_fn, db_fn, out_fn, threads)

    # Summarise the mapping result of the (single) test species.
    species = test_species[0]
    # NOTE: this rebinds the name mapping_results, shadowing the mapping_results() function defined above.
    mapping_results , full_results_df = pull_mapping_results_v3(sub_db_mapping_fn[species])
    # Hits are looked up directly by target name in the qiime taxonomy file.
    mapping_results = assign_taxranks_results(mapping_results, qiime_tax_fn)
    taxfileid = getquery_taxfileid(wheat_reference_dataframe_fn, species)

    # Known taxonomy of the query species, taken from the wheat taxonomy file.
    query_tax_dict = get_taxid_dict(wheat_taxonomy_file_fn, taxfileid)
    print(query_tax_dict)

    # Fraction of hits agreeing with the query at each rank.
    sensitivity_dict = get_accuracy_dict(mapping_results, query_tax_dict)

    print(json.dumps(sensitivity_dict, indent=1))
    print('\n')
    # The directory is spelled out here; it is the same location that mapping_dir points to.
    with open('/media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/qiime_results/%s.json' % species, 'w+') as fp:
        json.dump(sensitivity_dict, fp)

puccinia_striiformis
{'k': 'Fungi', 'p': 'Basidiomycota', 'c': 'Pucciniomycetes', 'o': 'Pucciniales', 'f': 'Pucciniaceae', 'g': 'Puccinia', 's': 'Puccinia_striiformis'}
{
 "k": 1.0,
 "p": 0.4479638009049774,
 "c": 0.0,
 "o": 0.0,
 "f": 0.0,
 "g": 0.0,
 "s": 0.0
}


zymoseptoria_tritici
{'k': 'Fungi', 'p': 'Ascomycota', 'c': 'Dothideomycetes', 'o': 'Capnodiales', 'f': 'Mycosphaerellaceae', 'g': 'Zymoseptoria', 's': 'Zymoseptoria_tritici'}
{
 "k": 1.0,
 "p": 0.6832579185520362,
 "c": 0.20361990950226244,
 "o": 0.09049773755656108,
 "f": 0.08597285067873303,
 "g": 0.08597285067873303,
 "s": 0.0
}


pyrenophora_tritici-repentis
{'k': 'Fungi', 'p': 'Ascomycota', 'c': 'Dothideomycetes', 'o': 'Pleosporales', 'f': 'Pleosporaceae', 'g': 'Pyrenophora', 's': 'Pyrenophora_tritici-repentis'}
{
 "k": 1.0,
 "p": 0.6635514018691588,
 "c": 0.2570093457943925,
 "o": 0.2570093457943925,
 "f": 0.10747663551401869,
 "g": 0.08411214953271028,
 "s": 0.08411214953271028
}




##### Custom Database

In [3]:
### list of species in the max database
# Species labels contained in the max database.
max_species = [
    'Puccinia_striiformis',
    'Zymoseptoria_tritici',
    'Pyrenophora_tritici-repentis',
    'Fusarium_oxysporum',
    'Tuber_brumale',
    'Cortinarius_globuliformis',
    'Aspergillus_niger',
    'Clavispora_lusitaniae',
    'Kluyveromyces_unidentified',
    'Penicillium_chrysogenum',
    'Rhodotorula_mucilaginosa',
    'Scedosporium_boydii',
    'Blastobotrys_proliferans',
    'Debaryomyces_unidentified',
    'Galactomyces_geotrichum',
    'Kodamaea_ohmeri',
    'Meyerozyma_guilliermondii',
    'Wickerhamomyces_anomalus',
    'Yamadazyma_mexicana',
    'Yamadazyma_scolyti',
    'Yarrowia_lipolytica',
    'Zygoascus_hellenicus',
    'Aspergillus_flavus',
    'Cryptococcus_zero',
    'Aspergillus_unidentified',
    'Diaporthe_CCL067',
    'Diaporthe_unidentified',
    'Oculimacula_yallundae-CCL031',
    'Oculimacula_yallundae-CCL029',
    'Dothiorella_vidmadera',
    'Quambalaria_cyanescens',
    'Entoleuca_unidentified',
    'Asteroma_CCL060',
    'Asteroma_CCL068',
    'Saccharomyces_cerevisiae',
    'Cladophialophora_unidentified',
    'Candida_albicans',
    'Candida_metapsilosis',
    'Candida_orthopsilosis',
    'Candida_parapsilosis',
    'Candida_unidentified',
    'Kluyveromyces_marxianus',
    'Pichia_kudriavzevii',
    'Pichia_membranifaciens',
]
# Identity mapping: every species label maps to itself.
references = dict(zip(max_species, max_species))

In [4]:
taxonomy_file_fn = os.path.abspath('/media/MassStorage/tmp/TE/honours/analysis/Stats/taxonomy_file_wheat.csv')
# One lookup table per rank, keyed by the lower-cased species label.
species_file = {}
genus_file = {}
family_file = {}
order_file = {}
class_file = {}
phylum_file = {}

# (table, position of the rank inside the ';'-separated lineage), in fill order.
rank_tables = (
    (species_file, 6),
    (genus_file, 5),
    (family_file, 4),
    (order_file, 3),
    (class_file, 2),
    (phylum_file, 1),
)

with open(taxonomy_file_fn, 'r') as fh:
    for line in fh:
        # Second tab-separated field holds the lineage as '<rank>__<name>' entries.
        lineage = line.split('\t')[1].split(';')
        species_name = lineage[6].split('__')[1].split('\n')[0]
        lookup_key = references[species_name].lower()
        for rank_table, position in rank_tables:
            rank_table[lookup_key] = lineage[position].split('__')[1].split('\n')[0]

In [144]:
species_file

{'puccinia_striiformis': 'Puccinia_striiformis',
 'zymoseptoria_tritici': 'Zymoseptoria_tritici',
 'pyrenophora_tritici-repentis': 'Pyrenophora_tritici-repentis',
 'fusarium_oxysporum': 'Fusarium_oxysporum',
 'tuber_brumale': 'Tuber_brumale',
 'cortinarius_globuliformis': 'Cortinarius_globuliformis',
 'aspergillus_niger': 'Aspergillus_niger',
 'clavispora_lusitaniae': 'Clavispora_lusitaniae',
 'kluyveromyces_unidentified': 'Kluyveromyces_unidentified',
 'penicillium_chrysogenum': 'Penicillium_chrysogenum',
 'rhodotorula_mucilaginosa': 'Rhodotorula_mucilaginosa',
 'scedosporium_boydii': 'Scedosporium_boydii',
 'blastobotrys_proliferans': 'Blastobotrys_proliferans',
 'debaryomyces_unidentified': 'Debaryomyces_unidentified',
 'galactomyces_geotrichum': 'Galactomyces_geotrichum',
 'kodamaea_ohmeri': 'Kodamaea_ohmeri',
 'meyerozyma_guilliermondii': 'Meyerozyma_guilliermondii',
 'wickerhamomyces_anomalus': 'Wickerhamomyces_anomalus',
 'yamadazyma_mexicana': 'Yamadazyma_mexicana',
 'yamadazym

In [5]:
# Barcodes whose reads are subsampled and mapped in the cells below.
mock_community = [
    'barcode01',
    'barcode02',
    'barcode05',
]

In [6]:
def subsamplereads(in_fn, out_fn, n_reads):
    """
    Call reformat.sh to subsample in_fn down to a target of n_reads reads and
    write them to out_fn.

    Prints a completion message when the command exits with status 0.
    Otherwise prints the failure message followed by the exit status and the
    output captured from the tool, so the cause of the failure is visible.
    Returns None in both cases.
    """
    command = F'reformat.sh samplereadstarget={n_reads} in={in_fn} out={out_fn}'
    status, tool_output = subprocess.getstatusoutput(command)
    if status == 0:
        print(F":)Completed {command}\n")
    else:
        print(F":(check one {command}!!\n")
        # Without this the reason for the failure is silently thrown away.
        print(F"exit status {status}:\n{tool_output}\n")

In [7]:
# Number of reads requested per barcode; passed to subsamplereads() and
# embedded in the names of the subsampled read files.
n_reads = 2000

In [8]:
# Output directory of the wheat mapping run (resolved against the working directory).
OUT_DIR = os.path.abspath('../../analysis/Mapping/wheat')
# makedirs with exist_ok=True also creates missing parent directories and is
# a no-op when the directory already exists.
os.makedirs(OUT_DIR, exist_ok=True)
# Directory that receives the subsampled reads of every barcode.
MC_READ_DIR = os.path.join(OUT_DIR, 'MC_READS')
os.makedirs(MC_READ_DIR, exist_ok=True)
# The two reference databases; these paths are intentionally kept relative
# because they are used verbatim as dictionary keys and minimap2 arguments.
sub_db_fn = '../../analysis/Mapping/gsref.subdb.fasta'
new_db_fn = '../../analysis/Mapping/gsref.db.fasta'

In [9]:
wheat_ref_df.columns

Index(['Unnamed: 0', 'species', 'genus', 'family', 'order', 'class', 'phylum',
       'kingdom', '# raw reads', '# reads after homology filtering',
       '# reads after length filtering', '# for use', 'path to raw reads',
       'path to homology filtering', 'path to length filtering',
       'path for use'],
      dtype='object')

In [10]:
# Absolute path of the read file to subsample, per barcode.
fn_subsampling = {}
for barcode in mock_community:
    print(barcode)
    relative_paths = wheat_ref_df[wheat_ref_df['Unnamed: 0'] == barcode]['path for use'].tolist()
    # 'path for use' is relative to the project root.
    fn_subsampling[barcode] = os.path.join(INPUT_BASEDIR, relative_paths[0])
fn_subsampling

barcode01
barcode02
barcode05


{'barcode01': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode01/length_restricted_reads.fasta',
 'barcode02': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode02/length_restricted_reads.fasta',
 'barcode05': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode05/length_restricted_reads.fasta'}

In [11]:
# Subsample every barcode and remember where the subsampled reads were written.
sub_reads_fn = {}
for species, in_fn in fn_subsampling.items():
    print(species)
    print(in_fn)
    out_fn = os.path.join(MC_READ_DIR, F'{species}.{n_reads}.fasta')
    subsamplereads(in_fn, out_fn, n_reads)
    sub_reads_fn[species] = out_fn

barcode01
/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode01/length_restricted_reads.fasta
:(check one reformat.sh samplereadstarget=2000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode01/length_restricted_reads.fasta out=/media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/MC_READS/barcode01.2000.fasta!!

barcode02
/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode02/length_restricted_reads.fasta
:(check one reformat.sh samplereadstarget=2000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode02/length_restricted_reads.fasta out=/media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/MC_READS/barcode02.2000.fasta!!

barcode05
/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode05/length_restricted_reads.fasta
:(check one reformat.sh samplereadstarget=2000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode05/length_restricted_reads.fasta ou

### Map with minimap against both databases

In [12]:
def minimapmapping(fasta_fn, ref_fn, out_fn, threads):
    """
    Map the reads in fasta_fn against ref_fn with minimap2 (preset map-ont)
    using the given number of threads and write the result to out_fn.

    Prints a completion message when the command exits with status 0.
    Otherwise prints the failure message followed by the exit status and the
    output captured from the tool, so the cause of the failure is visible.
    Returns None in both cases.
    """
    command = F"minimap2 -x map-ont -t {threads} {ref_fn} {fasta_fn} -o {out_fn}"
    status, tool_output = subprocess.getstatusoutput(command)
    if status == 0:
        print(F":)Completed {command}\n")
    else:
        print(F":(check one {command}!!\n")
        # Without this the reason for the failure is silently thrown away.
        print(F"exit status {status}:\n{tool_output}\n")

In [13]:
# One result directory per database, named after the database file with the
# '.fasta' suffix removed and the remaining dots turned into underscores.
dbases_fn = {}
for database_fn in (sub_db_fn, new_db_fn):
    print(database_fn)
    folder_name = os.path.basename(database_fn).replace('.fasta', '').replace('.','_')
    database_out_dir = os.path.join(OUT_DIR, folder_name)
    dbases_fn[database_fn] = database_out_dir
    if not os.path.exists(database_out_dir):
        os.mkdir(database_out_dir)
dbases_fn

../../analysis/Mapping/gsref.subdb.fasta
../../analysis/Mapping/gsref.db.fasta


{'../../analysis/Mapping/gsref.subdb.fasta': '/media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/gsref_subdb',
 '../../analysis/Mapping/gsref.db.fasta': '/media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/gsref_db'}

In [14]:
# Map every subsampled read set against the sub database.
db_fn = sub_db_fn
sub_db_mapping_fn = {}
for species in sub_reads_fn:
    db_name = os.path.basename(db_fn).replace('.fasta', '')
    out_fn = os.path.join(dbases_fn[db_fn], F"{db_name}.{species}.minimap2.paf")
    sub_db_mapping_fn[species] = out_fn
    minimapmapping(sub_reads_fn[species], db_fn, out_fn, threads)

:)Completed minimap2 -x map-ont -t 10 ../../analysis/Mapping/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/MC_READS/barcode01.2000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/gsref_subdb/gsref.subdb.barcode01.minimap2.paf

:)Completed minimap2 -x map-ont -t 10 ../../analysis/Mapping/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/MC_READS/barcode02.2000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/gsref_subdb/gsref.subdb.barcode02.minimap2.paf

:)Completed minimap2 -x map-ont -t 10 ../../analysis/Mapping/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/MC_READS/barcode05.2000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/gsref_subdb/gsref.subdb.barcode05.minimap2.paf



In [15]:
# Map every subsampled read set against the full database.
db_fn = new_db_fn
new_db_mapping_fn = {}
for species in sub_reads_fn:
    db_name = os.path.basename(db_fn).replace('.fasta', '')
    out_fn = os.path.join(dbases_fn[db_fn], F"{db_name}.{species}.minimap2.paf")
    new_db_mapping_fn[species] = out_fn
    minimapmapping(sub_reads_fn[species], db_fn, out_fn, threads)

:)Completed minimap2 -x map-ont -t 10 ../../analysis/Mapping/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/MC_READS/barcode01.2000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/gsref_db/gsref.db.barcode01.minimap2.paf

:)Completed minimap2 -x map-ont -t 10 ../../analysis/Mapping/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/MC_READS/barcode02.2000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/gsref_db/gsref.db.barcode02.minimap2.paf

:)Completed minimap2 -x map-ont -t 10 ../../analysis/Mapping/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/MC_READS/barcode05.2000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/gsref_db/gsref.db.barcode05.minimap2.paf



### Look at mapping results

In [16]:
def mapping_results_v2(fn, species,expected_species, tax_rank, tax_rank_file,
                       out_dir='/media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/custom_results'):
    """
    Print and store the hit fractions of a minimap2 PAF file at one taxonomic rank.

    fn: PAF file; only the first 12 columns are read.
    species: key of the sample in expected_species (the barcode in the calls below).
    expected_species: dictionary mapping that key to the expected species label.
    tax_rank: rank code; 's' or 'species' reports the target names as they are,
        any other value aggregates the hits through tax_rank_file.
    tax_rank_file: dictionary mapping a species label to its name at tax_rank.
        Target names missing from it keep their own name.
    out_dir: directory that receives '<tax_rank>_<expected species>.json'.
        Defaults to the wheat custom_results directory used so far.

    Every alignment is scored with cscore = alen / (alen - nmatch) and per read
    only the alignment(s) with the highest cscore are kept. The printed and
    stored series holds, per name, the share of the kept alignments, sorted
    in descending order.
    """
    min_header = ['qseqid', 'qlen', 'qstart', 'qstop', 'strand', 'tname', 'tlen', 'tstart', 'tend', 'nmatch', 'alen', 'mquality']
    tmp_df = pd.read_csv(fn, sep='\t', header=None, usecols=list(range(12)), names=min_header)
    tmp_df['cscore'] = tmp_df['alen']/(tmp_df['alen']-tmp_df['nmatch'])
    best_cscore = tmp_df.groupby('qseqid')['cscore'].transform('max')
    sub_df = tmp_df[tmp_df['cscore'] == best_cscore].reset_index(drop=True)

    # Share of the kept alignments per target name.
    hit_counts = sub_df.groupby('tname')['mquality'].count()
    hit_series = pd.Series(hit_counts.values / hit_counts.sum(), index=hit_counts.index)

    if tax_rank in ('s', 'species'):
        rank_series = hit_series
    else:
        # Translate the target names to the requested rank and add up the
        # shares of names that collapse onto the same taxon. sort=False keeps
        # the order of first appearance; the index name is cleared so the
        # printed series looks the same as before.
        rank_series = (
            hit_series.rename(tax_rank_file, axis='index')
            .groupby(level=0, sort=False)
            .sum()
            .rename_axis(None)
        )
    rank_series = rank_series.sort_values(ascending=False)

    # Sanity check: filtering must not lose any read.
    print(sub_df.qseqid.unique().shape == tmp_df.qseqid.unique().shape)
    print('##########\n')
    print(F"This was the sample type expected: {tax_rank_file[expected_species[species]]}\n")
    print(F"These are the results:")
    print(rank_series,'\n')
    rank_series.to_json(os.path.join(out_dir, '%s_%s.json' % (tax_rank,expected_species[species])))

In [17]:
# Expected taxon per barcode (first column of the reference table), one
# dictionary per rank from species up to phylum.
barcode_column = wheat_ref_df['Unnamed: 0']
expected_rank_s = dict(zip(barcode_column, wheat_ref_df['species']))
expected_rank_g = dict(zip(barcode_column, wheat_ref_df['genus']))
expected_rank_f = dict(zip(barcode_column, wheat_ref_df['family']))
expected_rank_o = dict(zip(barcode_column, wheat_ref_df['order']))
expected_rank_c = dict(zip(barcode_column, wheat_ref_df['class']))
expected_rank_p = dict(zip(barcode_column, wheat_ref_df['phylum']))

In [18]:
# Species-level summary of the reads mapped against the full database.
for barcode in new_db_mapping_fn:
    mapping_results_v2(new_db_mapping_fn[barcode], barcode, expected_rank_s, 's', species_file)

True
##########

This was the sample type expected: Puccinia_striiformis

These are the results:
tname
cryptococcus_zero                0.569652
blastobotrys_proliferans         0.208458
cortinarius_globuliformis        0.053234
saccharomyces_cerevisiae         0.023881
rhodotorula_mucilaginosa         0.015920
meyerozyma_guilliermondii        0.011940
dothiorella_vidmadera            0.010945
candida_albicans                 0.010448
candida_unidentified             0.010448
entoleuca_unidentified           0.007960
zymoseptoria_tritici             0.006965
aspergillus_flavus               0.006965
wickerhamomyces_anomalus         0.006468
puccinia_striiformis             0.005970
quambalaria_cyanescens           0.005970
cladophialophora_unidentified    0.004478
tuber_brumale                    0.003483
candida_metapsilosis             0.003483
pyrenophora_tritici-repentis     0.003483
clavispora_lusitaniae            0.003483
oculimacula_yallundae-ccl031     0.002488
oculimacula_yal

In [19]:
# Genus-level summary of the reads mapped against the full database.
for barcode in new_db_mapping_fn:
    mapping_results_v2(new_db_mapping_fn[barcode], barcode, expected_rank_s, 'g', genus_file)

True
##########

This was the sample type expected: Puccinia

These are the results:
Cryptococcus        0.569652
Blastobotrys        0.208458
Cortinarius         0.053234
Candida             0.027363
Saccharomyces       0.023881
Rhodotorula         0.015920
Meyerozyma          0.011940
Dothiorella         0.010945
Aspergillus         0.008955
Entoleuca           0.007960
Zymoseptoria        0.006965
Wickerhamomyces     0.006468
Puccinia            0.005970
Quambalaria         0.005970
Oculimacula         0.004975
Cladophialophora    0.004478
Kluyveromyces       0.003483
Pyrenophora         0.003483
Tuber               0.003483
Clavispora          0.003483
Yamadazyma          0.002985
Debaryomyces        0.002488
Pichia              0.001493
Yarrowia            0.001493
Penicillium         0.001493
Diaporthe           0.000995
Fusarium            0.000498
Zygoascus           0.000498
Asteroma            0.000498
Kodamaea            0.000498
dtype: float64 

True
##########

This was th

In [20]:
# Family-level summary of the reads mapped against the full database.
for barcode in new_db_mapping_fn:
    mapping_results_v2(new_db_mapping_fn[barcode], barcode, expected_rank_s, 'f', family_file)

True
##########

This was the sample type expected: Pucciniaceae

These are the results:
Tremellaceae           0.569652
Trichomonascaceae      0.208955
Saccharomycetaceae     0.054726
Cortinariaceae         0.053234
Debaryomycetaceae      0.017413
Sporidiobolaceae       0.015920
Botryosphaeriaceae     0.010945
Aspergillaceae         0.010448
Xylariaceae            0.007960
Mycosphaerellaceae     0.006965
Phaffomycetaceae       0.006468
Pucciniaceae           0.005970
Quambalariaceae        0.005970
Dermateaceae           0.004975
Herpotrichiellaceae    0.004478
Metschnikowiaceae      0.003980
Pleosporaceae          0.003483
Tuberaceae             0.003483
Dipodascaceae          0.001493
Pichiaceae             0.001493
Diaporthaceae          0.000995
Nectriaceae            0.000498
Gnomoniaceae           0.000498
dtype: float64 

True
##########

This was the sample type expected: Mycosphaerellaceae

These are the results:
Mycosphaerellaceae     0.520916
Tremellaceae           0.165339

In [21]:
# Order-level summary of the reads mapped against the full database.
for barcode in new_db_mapping_fn:
    mapping_results_v2(new_db_mapping_fn[barcode], barcode, expected_rank_s, 'o', order_file)

True
##########

This was the sample type expected: Pucciniales

These are the results:
Tremellales          0.569652
Saccharomycetales    0.294527
Agaricales           0.053234
Sporidiobolales      0.015920
Botryosphaeriales    0.010945
Eurotiales           0.010448
Xylariales           0.007960
Capnodiales          0.006965
Microstromatales     0.005970
Pucciniales          0.005970
Helotiales           0.004975
Chaetothyriales      0.004478
Pezizales            0.003483
Pleosporales         0.003483
Diaporthales         0.001493
Hypocreales          0.000498
dtype: float64 

True
##########

This was the sample type expected: Capnodiales

These are the results:
Capnodiales          0.520916
Tremellales          0.165339
Saccharomycetales    0.146414
Agaricales           0.046813
Pleosporales         0.045319
Eurotiales           0.027390
Chaetothyriales      0.015936
Botryosphaeriales    0.010956
Microstromatales     0.006474
Sporidiobolales      0.005976
Helotiales           0.0034

In [22]:
# Class-level summary of the reads mapped against the full database.
for barcode in new_db_mapping_fn:
    mapping_results_v2(new_db_mapping_fn[barcode], barcode, expected_rank_s, 'c', class_file)

True
##########

This was the sample type expected: Pucciniomycetes

These are the results:
Tremellomycetes       0.569652
Saccharomycetes       0.294527
Agaricomycetes        0.053234
Dothideomycetes       0.021393
Microbotryomycetes    0.015920
Eurotiomycetes        0.014925
Sordariomycetes       0.009950
Exobasidiomycetes     0.005970
Pucciniomycetes       0.005970
Leotiomycetes         0.004975
Pezizomycetes         0.003483
dtype: float64 

True
##########

This was the sample type expected: Dothideomycetes

These are the results:
Dothideomycetes       0.577191
Tremellomycetes       0.165339
Saccharomycetes       0.146414
Agaricomycetes        0.046813
Eurotiomycetes        0.043327
Exobasidiomycetes     0.006474
Microbotryomycetes    0.005976
Leotiomycetes         0.003486
Sordariomycetes       0.003486
Pezizomycetes         0.000996
Pucciniomycetes       0.000498
dtype: float64 

True
##########

This was the sample type expected: Dothideomycetes

These are the results:
Dothideo

In [23]:
# Phylum-level summary of the reads mapped against the full database.
for barcode in new_db_mapping_fn:
    mapping_results_v2(new_db_mapping_fn[barcode], barcode, expected_rank_s, 'p', phylum_file)

True
##########

This was the sample type expected: Basidiomycota

These are the results:
Basidiomycota    0.650746
Ascomycota       0.349254
dtype: float64 

True
##########

This was the sample type expected: Ascomycota

These are the results:
Ascomycota       0.7749
Basidiomycota    0.2251
dtype: float64 

True
##########

This was the sample type expected: Ascomycota

These are the results:
Ascomycota       0.681368
Basidiomycota    0.318632
dtype: float64 



## Apply to mock_database for qiime and consensus respectively

In [120]:
from Bio import SeqIO
import os
import random
import subprocess
import pandas as pd

# Root of the project data.
INPUT_BASEDIR = os.path.abspath('/media/MassStorage/tmp/TE/honours')
# Directories for the subsampled reads and the qiime-database mapping results of the mock run.
subsampling_dir = os.path.abspath('/media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/subsample_reads')
mapping_dir = os.path.abspath('/media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/qiime_results')

# Mock reference table, custom database and taxonomy file (qiime format).
mock_reference_dataframe_fn = os.path.abspath('/media/MassStorage/tmp/TE/honours/analysis/Stats/mock_reference_dataframe.csv')
mock_max_custom_database_fn = os.path.abspath('/media/MassStorage/tmp/TE/honours/database/mock_database_labelled.fasta')
mock_taxonomy_file_fn = os.path.abspath('/media/MassStorage/tmp/TE/honours/analysis/Stats/mock_taxonomy_file_qiime.csv')

# NOTE: unlike the paths above, these two are relative, so os.path.abspath resolves
# them against the working directory the notebook is started from.
qiime_db_fn = os.path.abspath('../../analysis/qiime2/db/sh_refs_qiime_ver8_dynamic_02.02.2019.fasta')
qiime_tax_fn = os.path.abspath('../../analysis/qiime2/db/sh_taxonomy_qiime_ver8_dynamic_02.02.2019.txt')
# Thread count for the mapping step.
threads = 10

# Load the reference table; the bare name on the last line displays it in the notebook.
mock_ref_df = pd.read_csv(mock_reference_dataframe_fn)
mock_ref_df

Unnamed: 0.1,Unnamed: 0,species,genus,family,order,class,phylum,kingdom,# raw reads,# reads after homology filtering,# reads after length filtering,# for use,path to raw reads,path to homology filtering,path to length filtering,path for use
0,20171212_FAH18688/barcode03,diaporthe_ccl067,diaporthe,diaporthaceae,diaporthales,sordariomycetes,ascomycota,fungi,172601,32847,30504,30504.0,analysis/Concatenated/20171212_FAH18688/barcod...,analysis/Python_Processing/20171212_FAH18688/b...,analysis/Length_Filtered/20171212_FAH18688/bar...,analysis/Length_Filtered/20171212_FAH18688/bar...
1,20171212_FAH18688/barcode04,diaporthe_unidentified,diaporthe,diaporthaceae,diaporthales,sordariomycetes,ascomycota,fungi,198500,29833,27941,27941.0,analysis/Concatenated/20171212_FAH18688/barcod...,analysis/Python_Processing/20171212_FAH18688/b...,analysis/Length_Filtered/20171212_FAH18688/bar...,analysis/Length_Filtered/20171212_FAH18688/bar...
2,20171103_FAH15473/barcode01,puccinia_striiformis,puccinia,pucciniaceae,pucciniales,pucciniomycetes,basidiomycota,fungi,272465,122080,113337,113337.0,analysis/Concatenated/20171103_FAH15473/barcod...,analysis/Python_Processing/20171103_FAH15473/b...,analysis/Length_Filtered/20171103_FAH15473/bar...,analysis/Length_Filtered/20171103_FAH15473/bar...
3,20171103_FAH15473/barcode02,zymoseptoria_tritici,zymoseptoria,mycosphaerellaceae,capnodiales,dothideomycetes,ascomycota,fungi,413127,143363,133089,133089.0,analysis/Concatenated/20171103_FAH15473/barcod...,analysis/Python_Processing/20171103_FAH15473/b...,analysis/Length_Filtered/20171103_FAH15473/bar...,analysis/Length_Filtered/20171103_FAH15473/bar...
4,20171103_FAH15473/barcode03,pyrenophora_tritici-repentis,pyrenophora,pleosporaceae,pleosporales,dothideomycetes,ascomycota,fungi,260896,97584,90015,90015.0,analysis/Concatenated/20171103_FAH15473/barcod...,analysis/Python_Processing/20171103_FAH15473/b...,analysis/Length_Filtered/20171103_FAH15473/bar...,analysis/Length_Filtered/20171103_FAH15473/bar...
5,20171103_FAH15473/barcode07,aspergillus_niger,aspergillus,aspergillaceae,eurotiales,eurotiomycetes,ascomycota,fungi,171615,65065,59406,59406.0,analysis/Concatenated/20171103_FAH15473/barcod...,analysis/Python_Processing/20171103_FAH15473/b...,analysis/Length_Filtered/20171103_FAH15473/bar...,analysis/Length_Filtered/20171103_FAH15473/bar...
6,20171103_FAH15473/barcode11,rhodotorula_mucilaginosa,rhodotorula,sporidiobolaceae,sporidiobolales,microbotryomycetes,basidiomycota,fungi,318405,127801,117801,117801.0,analysis/Concatenated/20171103_FAH15473/barcod...,analysis/Python_Processing/20171103_FAH15473/b...,analysis/Length_Filtered/20171103_FAH15473/bar...,analysis/Length_Filtered/20171103_FAH15473/bar...
7,20171103_FAH15473/barcode12,scedosporium_boydii,scedosporium,microascaceae,microascales,sordariomycetes,ascomycota,fungi,331947,102481,93723,93723.0,analysis/Concatenated/20171103_FAH15473/barcod...,analysis/Python_Processing/20171103_FAH15473/b...,analysis/Length_Filtered/20171103_FAH15473/bar...,analysis/Length_Filtered/20171103_FAH15473/bar...
8,20171207_FAH18654/barcode12,aspergillus_flavus,aspergillus,aspergillaceae,eurotiales,eurotiomycetes,ascomycota,fungi,125014,54061,51340,51340.0,analysis/Concatenated/20171207_FAH18654/barcod...,analysis/Python_Processing/20171207_FAH18654/b...,analysis/Length_Filtered/20171207_FAH18654/bar...,analysis/Length_Filtered/20171207_FAH18654/bar...
9,20180108_FAH18647/barcode01,saccharomyces_cerevisiae,saccharomyces,saccharomycetaceae,saccharomycetales,saccharomycetes,ascomycota,fungi,96837,33025,30260,30260.0,analysis/Concatenated/20180108_FAH18647/barcod...,analysis/Python_Processing/20180108_FAH18647/b...,analysis/Length_Filtered/20180108_FAH18647/bar...,analysis/Length_Filtered/20180108_FAH18647/bar...


##### Qiime Database

In [121]:
import json
from collections import OrderedDict

def assign_taxranks_results(mapping_df, tax_fn, ref_df_fn = False):
    """
    Fill in the taxonomic rank columns of a mapping results dataframe.

    Every index label of mapping_df is looked up in the taxonomy assignment file tax_fn.
    When ref_df_fn is given, the label is first translated into a taxonomy file identifier
    through the reference dataframe; otherwise the label itself is used as the identifier.
    Each rank/value pair that is found is written into the row of that label, so
    mapping_df is modified in place.
    Returns the same mapping dataframe.
    """
    for hit_name in mapping_df.index:
        taxfile_key = getquery_taxfileid(ref_df_fn, hit_name) if ref_df_fn else hit_name
        rank_values = get_taxid_dict(tax_fn, taxfile_key)
        for rank in rank_values:
            mapping_df.loc[hit_name, rank] = rank_values[rank]
    return mapping_df

def get_accuracy_dict(mapping_df, query_tax_dict):
    """
    Summarise the mapping accuracy of the mapping results at all taxonomic ranks.

    mapping_df: dataframe with a 'count' column and one column per taxonomic rank
        ['k', 'p', 'c', 'o', 'f', 'g', 's'] holding the assignment of every hit.
    query_tax_dict: dictionary with the known assignment of the query for each of these ranks.

    Returns an OrderedDict mapping each rank to the fraction of all counted hits whose
    assignment at that rank equals the one of the query.

    At species rank a hit labelled 'unidentified' is only accepted when its genus also
    matches the query; otherwise two unrelated 'unidentified' species would count as a hit.
    The exclusion is done with a boolean mask, so mapping_df itself is left unchanged.
    """
    tax_ranks = ['k', 'p', 'c', 'o', 'f', 'g', 's']
    accuracy_dict = OrderedDict()
    total_count = mapping_df['count'].sum()
    for tax_rank in tax_ranks:
        is_hit = mapping_df[tax_rank] == query_tax_dict[tax_rank]
        if tax_rank == 's':
            # 'unidentified' only says something about the species when the genus agrees
            unidentified_other_genus = (mapping_df['s'] == 'unidentified') & (mapping_df['g'] != query_tax_dict['g'])
            is_hit = is_hit & ~unidentified_other_genus
        hit_count = mapping_df.loc[is_hit, 'count'].sum()
        accuracy_dict[tax_rank] = hit_count/total_count
    return accuracy_dict

def getquery_taxfileid(refdf_fn, species):
    """
    Look up the taxonomy file identifier of a species in the reference dataframe.

    refdf_fn: filename of the reference dataframe (csv).
    species: value to search for in the 'species' column.
    Returns the first-column value of the first row whose species matches.
    An IndexError is raised when no row matches.
    """
    reference_table = pd.read_csv(refdf_fn)
    is_query_species = reference_table.species == species
    first_column = reference_table[is_query_species].iloc[:, 0]
    return first_column.values[0]

def get_taxid_dict(taxid_fn, taxfileid):
    """
    Read the taxonomic assignment of one identifier from a taxonomy file.

    taxid_fn: filename of a tab-separated taxonomy assignment file in the Qiime format,
        i.e. '<identifier>\\t<rank>__<name>;<rank>__<name>;...' on each line.
    taxfileid: identifier to look up.

    Returns a dictionary mapping each rank prefix to its name, or an empty dictionary
    when no line matches.

    Lines whose identifier column equals taxfileid exactly take precedence. Only when no
    such line exists are lines that merely start with taxfileid used. Plain prefix
    matching alone would let e.g. 'barcode1' pick up the assignment of 'barcode10'.
    """
    exact_match = {}
    prefix_match = {}
    found_exact = False
    with open(taxid_fn, 'r') as fh:
        for line in fh:
            if not line.startswith(taxfileid):
                continue
            fields = line.rstrip().split('\t')
            if fields[0] == taxfileid:
                found_exact = True
                target = exact_match
            else:
                target = prefix_match
            for taxrank in fields[1].split(';'):
                rank_parts = taxrank.split('__')
                target[rank_parts[0]] = rank_parts[1]
    return exact_match if found_exact else prefix_match

def mapping_results(fn, species):
    """
    Print and store the fraction of best hits per target for one minimap2 PAF file.

    Only the first 12 columns of fn are read. For every read the alignment(s) with the
    highest cscore = alen/(alen - nmatch) are kept, and the kept alignments are counted
    per target name and divided by their total. The function prints whether the number of
    distinct reads is unchanged by the filter, then the query species and the sorted hit
    fractions, and finally writes the fractions as JSON to the hard-coded custom_results
    directory under the name '<species>.json'. Nothing is returned.
    """
    paf_columns = ['qseqid', 'qlen', 'qstart', 'qstop', 'strand', 'tname', 'tlen', 'tstart', 'tend', 'nmatch', 'alen', 'mquality']
    alignments = pd.read_csv(fn, sep='\t', header = None, usecols=list(range(0, 12)), names=paf_columns)
    alignments['cscore'] = alignments['alen']/(alignments['alen']-alignments['nmatch'])
    best_per_read = alignments.groupby('qseqid')['cscore'].transform(max)
    best_hits = alignments[alignments['cscore'] == best_per_read].reset_index(drop=True)
    hits_per_target = best_hits.groupby('tname')['mquality'].count()
    hit_series = pd.Series(hits_per_target.tolist()/hits_per_target.sum(), hits_per_target.index)
    hit_series = hit_series.sort_values(ascending=False)
    same_read_count = best_hits.qseqid.unique().shape == alignments.qseqid.unique().shape
    print(same_read_count)
    print('##########\n')
    print(F"This was the query species: {species}\n")
    print(F"These are the results:")
    print(hit_series,'\n')
    hit_series.to_json('/media/MassStorage/tmp/TE/honours/analysis/Mapping/custom_results/%s.json' % species)

def minimapmapping(fasta_fn, ref_fn, out_fn, threads):
    """
    Map the reads in fasta_fn against ref_fn with minimap2 (map-ont preset).

    fasta_fn: reads to map.
    ref_fn: reference sequences.
    out_fn: file minimap2 writes its output to (-o).
    threads: number of threads (-t).

    The command is run through the shell. A non-zero exit status is reported on stdout
    together with the command and its captured output, so that a failed mapping is not
    silently followed by an analysis of a missing or stale output file.
    Returns None.
    """
    command = F"minimap2 -x map-ont -t {threads} {ref_fn} {fasta_fn} -o {out_fn}"
    status, output = subprocess.getstatusoutput(command)
    if status != 0:
        print(F":(check one {command}!!\n{output}\n")

def pull_mapping_results_v3(fn):
    """
    Read a minimap2 PAF file and count the best hits per target.

    Only the first 12 columns of fn are read. Every alignment gets a
    cscore = alen/(alen - nmatch), and for each read the alignment(s) with the highest
    cscore are kept.

    Returns a tuple (hit_df, full_df):
    hit_df has the target names as index and the number of kept alignments in the column
        'count', sorted in descending order.
    full_df holds all alignments that were read, including the 'cscore' column.
    Both dataframes also carry the taxrank columns ['k', 'p', 'c', 'o', 'f', 'g', 's'],
    which are all False to start with.
    """
    tax_ranks = ['k', 'p', 'c', 'o', 'f', 'g', 's']
    paf_columns = ['qseqid', 'qlen', 'qstart', 'qstop', 'strand', 'tname', 'tlen', 'tstart', 'tend', 'nmatch', 'alen', 'mquality']
    alignments = pd.read_csv(fn, sep='\t', header = None, usecols=list(range(0, 12)), names=paf_columns)
    alignments['cscore'] = alignments['alen']/(alignments['alen']-alignments['nmatch'])
    best_per_read = alignments.groupby('qseqid')['cscore'].transform(max)
    best_hits = alignments[alignments['cscore'] == best_per_read].reset_index(drop=True)
    hits_per_target = best_hits.groupby('tname')['cscore'].count()
    hit_counts = pd.DataFrame(hits_per_target.tolist(), hits_per_target.index, columns=['count'])
    hit_counts = hit_counts.sort_values(by='count', ascending=False)
    for rank in tax_ranks:
        hit_counts[rank] = False
        alignments[rank] = False
    return hit_counts, alignments
    
def subsamplereads(in_fn, out_fn, n_reads):
    """
    Subsample the reads in in_fn into out_fn with reformat.sh.

    in_fn: input read file (in=).
    out_fn: output read file (out=).
    n_reads: passed to reformat.sh as samplereadstarget.

    The command is run through the shell. A non-zero exit status is reported on stdout
    together with the command and its captured output, so that a failed subsampling is
    not silently followed by mapping a missing or stale read file.
    Returns None.
    """
    command = F'reformat.sh samplereadstarget={n_reads} in={in_fn} out={out_fn}'
    status, output = subprocess.getstatusoutput(command)
    if status != 0:
        print(F":(check one {command}!!\n{output}\n")

# Collect every species listed in mock_ref_df as a test query.
# NOTE(review): mock_ref_df, mock_reference_dataframe_fn and mock_taxonomy_file_fn are not
# defined in this cell; presumably they are set elsewhere in the notebook. Confirm.
test_species_list = []
for entry in mock_ref_df.species.tolist():
#     if entry[-7:] != '-ccl031' and entry[-7:] != '-ccl029':
#         test_species_list.append(entry)
#     else:
#         test_species_list.append(entry[:-7])
#         print(entry[:-7])
    test_species_list.append(entry)
    
# For each species: subsample its reads, map them against the Qiime database and
# summarise the fraction of correctly assigned hits at every taxonomic rank.
for test_species in test_species_list:
    
    print(test_species)
    
    # subsample the test species
    fn_subsampling = {}
    # Rebinds the loop variable to a one-element list so the loops below can iterate over it.
    test_species = [test_species]
    for x in test_species:
#         print((mock_ref_df['species'] == x))
        # Match on species and on genus (the part of the species name before the first '_').
        fn_subsampling[x] = (mock_ref_df[(mock_ref_df['species'] == x) & (mock_ref_df['genus'] == x.split('_')[0])]['path for use'].tolist()[0])
        fn_subsampling[x] = os.path.join(INPUT_BASEDIR, fn_subsampling[x])

    sub_reads_fn = {}
    n_reads = 200  # passed to reformat.sh as samplereadstarget
    for key, value in fn_subsampling.items():
        species = key
        in_fn = value
        out_fn = os.path.join(subsampling_dir, F'{species}.{n_reads}.fasta')
        subsamplereads(in_fn, out_fn, n_reads)
        sub_reads_fn[species] = out_fn
        
    ###Map the reads
    db_fn = qiime_db_fn
    sub_db_mapping_fn = {}  # species -> path of the PAF file written by minimap2
    for species, fasta_fn in sub_reads_fn.items():
        db_name = os.path.basename(db_fn).replace('.fasta', '')
        out_fn = os.path.join(mapping_dir, F"{db_name}.{species}.minimap2.paf")
        sub_db_mapping_fn[species] = out_fn
        minimapmapping(fasta_fn, db_fn, out_fn, threads)
        
    ###Test out the summary results statistic for a single mapping result
    species = test_species[0]
    # NOTE(review): this rebinds the name mapping_results, which is also the name of a
    # function defined above; after this line that function is no longer reachable by name.
    mapping_results , full_results_df = pull_mapping_results_v3(sub_db_mapping_fn[species])
    # Hits are Qiime database entries, so their ranks come from the Qiime taxonomy file.
    mapping_results = assign_taxranks_results(mapping_results, qiime_tax_fn)
    # The expected ranks of the query come from the mock reference table and taxonomy file.
    taxfileid = getquery_taxfileid(mock_reference_dataframe_fn, species)
    query_tax_dict = get_taxid_dict(mock_taxonomy_file_fn, taxfileid)
    
    sensitivity_dict = get_accuracy_dict(mapping_results, query_tax_dict)
        
    print(json.dumps(sensitivity_dict, indent=1))
    print('\n')
    # NOTE(review): results are written below .../Mapping/mock/qiime_results, while
    # subsampling_dir and mapping_dir (set at the top of the notebook) point at the wheat
    # directories. Confirm which data set this cell is meant to process.
    with open('/media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/qiime_results/%s.json' % species, 'w+') as fp:
        json.dump(sensitivity_dict, fp)

diaporthe_ccl067
{
 "k": 1.0,
 "p": 0.9724770642201835,
 "c": 0.7844036697247706,
 "o": 0.4954128440366973,
 "f": 0.0,
 "g": 0.0,
 "s": 0.0
}


diaporthe_unidentified
{
 "k": 1.0,
 "p": 0.9912663755458515,
 "c": 0.7030567685589519,
 "o": 0.27510917030567683,
 "f": 0.25327510917030566,
 "g": 0.25327510917030566,
 "s": 0.0
}


puccinia_striiformis
{
 "k": 1.0,
 "p": 0.9748743718592965,
 "c": 0.8894472361809045,
 "o": 0.8894472361809045,
 "f": 0.8894472361809045,
 "g": 0.8793969849246231,
 "s": 0.8492462311557789
}


zymoseptoria_tritici
{
 "k": 1.0,
 "p": 0.8709677419354839,
 "c": 0.3778801843317972,
 "o": 0.2350230414746544,
 "f": 0.2350230414746544,
 "g": 0.22119815668202766,
 "s": 0.0
}


pyrenophora_tritici-repentis
{
 "k": 1.0,
 "p": 0.8293838862559242,
 "c": 0.5829383886255924,
 "o": 0.5687203791469194,
 "f": 0.3933649289099526,
 "g": 0.33649289099526064,
 "s": 0.3127962085308057
}


aspergillus_niger
{
 "k": 1.0,
 "p": 0.9489795918367347,
 "c": 0.5782312925170068,
 "o": 0.57823129

##### Custom Database

In [122]:
# Species contained in the max database, spelled as in the database labels.
max_species = [
    'Puccinia_striiformis',
    'Zymoseptoria_tritici',
    'Pyrenophora_tritici-repentis',
    'Fusarium_oxysporum',
    'Tuber_brumale',
    'Cortinarius_globuliformis',
    'Aspergillus_niger',
    'Clavispora_lusitaniae',
    'Kluyveromyces_unidentified',
    'Penicillium_chrysogenum',
    'Rhodotorula_mucilaginosa',
    'Scedosporium_boydii',
    'Blastobotrys_proliferans',
    'Debaryomyces_unidentified',
    'Galactomyces_geotrichum',
    'Kodamaea_ohmeri',
    'Meyerozyma_guilliermondii',
    'Wickerhamomyces_anomalus',
    'Yamadazyma_mexicana',
    'Yamadazyma_scolyti',
    'Yarrowia_lipolytica',
    'Zygoascus_hellenicus',
    'Aspergillus_flavus',
    'Cryptococcus_zero',
    'Aspergillus_unidentified',
    'Diaporthe_CCL067',
    'Diaporthe_unidentified',
    'Oculimacula_yallundae-CCL031',
    'Oculimacula_yallundae-CCL029',
    'Dothiorella_vidmadera',
    'Quambalaria_cyanescens',
    'Entoleuca_unidentified',
    'Asteroma_CCL060',
    'Asteroma_CCL068',
    'Saccharomyces_cerevisiae',
    'Cladophialophora_unidentified',
    'Candida_albicans',
    'Candida_metapsilosis',
    'Candida_orthopsilosis',
    'Candida_parapsilosis',
    'Candida_unidentified',
    'Kluyveromyces_marxianus',
    'Pichia_kudriavzevii',
    'Pichia_membranifaciens',
]

# Identity lookup: every species name maps to itself.
references = {}
for species in max_species:
    references.update({species: species})

In [123]:
taxonomy_file_fn = os.path.abspath('/media/MassStorage/tmp/TE/honours/analysis/Stats/taxonomy_file_mock.csv')
species_file = {}
genus_file = {}
family_file = {}
order_file = {}
class_file = {}
phylum_file = {}

# (position in the ';'-separated lineage, lookup table filled from that position)
rank_tables = [
    (6, species_file),
    (5, genus_file),
    (4, family_file),
    (3, order_file),
    (2, class_file),
    (1, phylum_file),
]

with open(taxonomy_file_fn, 'r') as fh:
    for line in fh:
        # second tab-separated field holds the lineage as '<rank>__<name>' entries
        lineage = line.split('\t')[1].split(';')
        # every table is keyed by the lower-cased species name found at position 6
        species_key = references[lineage[6].split('__')[1].split('\n')[0]].lower()
        for position, table in rank_tables:
            table[species_key] = lineage[position].split('__')[1].split('\n')[0]

In [124]:
taxonomy_file_fn = os.path.abspath('/media/MassStorage/tmp/TE/honours/analysis/Stats/taxonomy_file_mock.csv')
# Only these four tables are reset and refilled here.
species_file = {}
genus_file = {}
family_file = {}
order_file = {}

# (position in the ';'-separated lineage, lookup table filled from that position)
rank_tables = [
    (6, species_file),
    (5, genus_file),
    (4, family_file),
    (3, order_file),
]

with open(taxonomy_file_fn, 'r') as fh:
    for line in fh:
        # second tab-separated field holds the lineage as '<rank>__<name>' entries
        lineage = line.split('\t')[1].split(';')
        # every table is keyed by the lower-cased species name found at position 6
        species_key = references[lineage[6].split('__')[1].split('\n')[0]].lower()
        for position, table in rank_tables:
            table[species_key] = lineage[position].split('__')[1].split('\n')[0]

In [125]:
# Species whose reads are subsampled and mapped in the cells below.
mock_community = [
    'aspergillus_flavus',
    'aspergillus_niger',
    'candida_albicans',
    'candida_metapsilosis',
    'candida_orthopsilosis',
    'diaporthe_unidentified',
    'diaporthe_ccl067',
    'puccinia_striiformis',
    'pyrenophora_tritici-repentis',
    'rhodotorula_mucilaginosa',
    'saccharomyces_cerevisiae',
    'scedosporium_boydii',
    'zymoseptoria_tritici',
]

In [126]:
def subsamplereads(in_fn, out_fn, n_reads):
    """
    Subsample the reads in in_fn into out_fn with reformat.sh.

    n_reads is passed to reformat.sh as samplereadstarget. The command is run through
    the shell and a line reporting its success or failure is printed. Returns None.
    """
    command = F'reformat.sh samplereadstarget={n_reads} in={in_fn} out={out_fn}'
    exit_status, _ = subprocess.getstatusoutput(command)
    if exit_status != 0:
        print(F":(check one {command}!!\n")
        return
    print(F":)Completed {command}\n")

In [127]:
# Number of reads to subsample per species (passed to reformat.sh as samplereadstarget).
n_reads = 200

In [128]:
# Output directory of the mock community mapping and its sub-directory for the
# subsampled reads. makedirs(..., exist_ok=True) also creates missing parent directories
# and avoids the gap between an exists() check and the mkdir() call.
OUT_DIR = os.path.abspath('../../analysis/Mapping/mock')
os.makedirs(OUT_DIR, exist_ok=True)
MC_READ_DIR = os.path.join(OUT_DIR, 'subsample_reads')
os.makedirs(MC_READ_DIR, exist_ok=True)
# Reference databases to map against. The paths stay relative because they are also
# used verbatim as dictionary keys and in the minimap2 command line.
sub_db_fn = '../../analysis/Mapping/gsref.subdb.fasta'
new_db_fn = '../../analysis/Mapping/gsref.db.fasta'

In [129]:
# Display the column labels of the mock reference dataframe.
mock_ref_df.columns

Index(['Unnamed: 0', 'species', 'genus', 'family', 'order', 'class', 'phylum',
       'kingdom', '# raw reads', '# reads after homology filtering',
       '# reads after length filtering', '# for use', 'path to raw reads',
       'path to homology filtering', 'path to length filtering',
       'path for use'],
      dtype='object')

In [130]:
# species -> absolute path of the read file to subsample from
fn_subsampling = {}
for x in mock_community:
    print(x)
    species_rows = mock_ref_df.loc[mock_ref_df['species'] == x]
    # 'path for use' is stored relative to INPUT_BASEDIR; the first matching row is used
    relative_path = species_rows['path for use'].tolist()[0]
    fn_subsampling[x] = os.path.join(INPUT_BASEDIR, relative_path)
fn_subsampling

aspergillus_flavus
aspergillus_niger
candida_albicans
candida_metapsilosis
candida_orthopsilosis
diaporthe_unidentified
diaporthe_ccl067
puccinia_striiformis
pyrenophora_tritici-repentis
rhodotorula_mucilaginosa
saccharomyces_cerevisiae
scedosporium_boydii
zymoseptoria_tritici


{'aspergillus_flavus': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171207_FAH18654/barcode12/length_restricted_for_use.fasta',
 'aspergillus_niger': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode07/length_restricted_for_use.fasta',
 'candida_albicans': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20180108_FAH18647/barcode03/length_restricted_for_use.fasta',
 'candida_metapsilosis': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20180108_FAH18647/barcode04/length_restricted_for_use.fasta',
 'candida_orthopsilosis': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20180108_FAH18647/barcode05/length_restricted_for_use.fasta',
 'diaporthe_unidentified': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171212_FAH18688/barcode04/length_restricted_for_use.fasta',
 'diaporthe_ccl067': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171212_FAH18688/barcode03/length_restrict

In [131]:
# species -> path of the subsampled read file
sub_reads_fn = {}
for key, value in fn_subsampling.items():
    print(key, value, sep='\n')
    species, in_fn = key, value
    out_fn = os.path.join(MC_READ_DIR, F'{species}.{n_reads}.fasta')
    subsamplereads(in_fn, out_fn, n_reads)
    sub_reads_fn[species] = out_fn

aspergillus_flavus
/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171207_FAH18654/barcode12/length_restricted_for_use.fasta
:(check one reformat.sh samplereadstarget=200 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171207_FAH18654/barcode12/length_restricted_for_use.fasta out=/media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/subsample_reads/aspergillus_flavus.200.fasta!!

aspergillus_niger
/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode07/length_restricted_for_use.fasta
:(check one reformat.sh samplereadstarget=200 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode07/length_restricted_for_use.fasta out=/media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/subsample_reads/aspergillus_niger.200.fasta!!

candida_albicans
/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20180108_FAH18647/barcode03/length_restricted_for_use.fasta
:(check one reformat.sh samplereadstar

### Map with minimap against both databases

In [132]:
def minimapmapping(fasta_fn, ref_fn, out_fn, threads):
    """
    Map the reads in fasta_fn against ref_fn with minimap2 (map-ont preset).

    The output is written to out_fn (-o) using the given number of threads (-t). The
    command is run through the shell and a line reporting its success or failure is
    printed. Returns None.
    """
    command = F"minimap2 -x map-ont -t {threads} {ref_fn} {fasta_fn} -o {out_fn}"
    exit_status, _ = subprocess.getstatusoutput(command)
    if exit_status != 0:
        print(F":(check one {command}!!\n")
        return
    print(F":)Completed {command}\n")

In [133]:
# database fasta path -> output directory for the mappings against that database
dbases_fn = {}
for x in (sub_db_fn, new_db_fn):
    print(x)
    # directory name: fasta basename without '.fasta' and with '.' replaced by '_'
    db_label = os.path.basename(x).replace('.fasta', '').replace('.','_')
    db_out_dir = os.path.join(OUT_DIR, db_label)
    dbases_fn[x] = db_out_dir
    if not os.path.exists(db_out_dir):
        os.mkdir(db_out_dir)
dbases_fn

../../analysis/Mapping/gsref.subdb.fasta
../../analysis/Mapping/gsref.db.fasta


{'../../analysis/Mapping/gsref.subdb.fasta': '/media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/gsref_subdb',
 '../../analysis/Mapping/gsref.db.fasta': '/media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/gsref_db'}

In [134]:
# Map every subsampled read set against the sub-database (sub_db_fn).
db_fn = sub_db_fn
# species -> path of the PAF file written for that species
sub_db_mapping_fn = {}
for species, fasta_fn in sub_reads_fn.items():
    tmp_out = dbases_fn[db_fn]  # output directory registered for this database
    db_name = os.path.basename(db_fn).replace('.fasta', '')
    out_fn = os.path.join(tmp_out, F"{db_name}.{species}.minimap2.paf")
    # the path is recorded before mapping, i.e. also when minimap2 fails
    sub_db_mapping_fn[species] = out_fn
    minimapmapping(fasta_fn, db_fn, out_fn, threads)

:)Completed minimap2 -x map-ont -t 10 ../../analysis/Mapping/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/subsample_reads/aspergillus_flavus.200.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/gsref_subdb/gsref.subdb.aspergillus_flavus.minimap2.paf

:)Completed minimap2 -x map-ont -t 10 ../../analysis/Mapping/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/subsample_reads/aspergillus_niger.200.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/gsref_subdb/gsref.subdb.aspergillus_niger.minimap2.paf

:)Completed minimap2 -x map-ont -t 10 ../../analysis/Mapping/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/subsample_reads/candida_albicans.200.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/gsref_subdb/gsref.subdb.candida_albicans.minimap2.paf

:)Completed minimap2 -x map-ont -t 10 ../../analysis/Mapping/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analy

In [135]:
# Map every subsampled read set against the full database (new_db_fn).
db_fn = new_db_fn
# species -> path of the PAF file written for that species
new_db_mapping_fn = {}
for species, fasta_fn in sub_reads_fn.items():
    tmp_out = dbases_fn[db_fn]  # output directory registered for this database
    db_name = os.path.basename(db_fn).replace('.fasta', '')
    out_fn = os.path.join(tmp_out, F"{db_name}.{species}.minimap2.paf")
    # the path is recorded before mapping, i.e. also when minimap2 fails
    new_db_mapping_fn[species] = out_fn
    minimapmapping(fasta_fn, db_fn, out_fn, threads)

:)Completed minimap2 -x map-ont -t 10 ../../analysis/Mapping/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/subsample_reads/aspergillus_flavus.200.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/gsref_db/gsref.db.aspergillus_flavus.minimap2.paf

:)Completed minimap2 -x map-ont -t 10 ../../analysis/Mapping/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/subsample_reads/aspergillus_niger.200.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/gsref_db/gsref.db.aspergillus_niger.minimap2.paf

:)Completed minimap2 -x map-ont -t 10 ../../analysis/Mapping/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/subsample_reads/candida_albicans.200.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/gsref_db/gsref.db.candida_albicans.minimap2.paf

:)Completed minimap2 -x map-ont -t 10 ../../analysis/Mapping/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/subsample_rea

### Look at mapping results

In [136]:
def mapping_results_v2(fn, species,expected_species, tax_rank, tax_rank_file):
    """
    Print and store the fraction of best hits at one taxonomic rank for a minimap2 PAF file.

    fn: PAF file; only the first 12 columns are read.
    species: key into expected_species for the sample that was mapped.
    expected_species: dictionary mapping `species` to the key used in tax_rank_file.
    tax_rank: rank label. For 's' or 'species' the hits are reported per target name;
        for any other value the target names are relabelled through tax_rank_file and
        the fractions of targets that share a label are summed.
    tax_rank_file: dictionary mapping target names to their name at tax_rank. Target
        names missing from it keep their own name.

    For every read the alignment(s) with the highest cscore = alen/(alen - nmatch) are
    kept, and the kept alignments are counted per target name and divided by their total.
    The function prints whether the number of distinct reads is unchanged by the filter,
    the expected label and the sorted fractions, and writes the fractions as JSON to the
    hard-coded mock/custom_results directory as '<tax_rank>_<expected species>.json'.
    Nothing is returned.
    """
    import numpy as np
    min_header = ['qseqid', 'qlen', 'qstart', 'qstop', 'strand', 'tname', 'tlen', 'tstart', 'tend', 'nmatch', 'alen', 'mquality']
    tmp_df = pd.read_csv(fn, sep='\t', header = None, usecols=[x for x in range(0,12)], names=min_header)
    tmp_df['cscore'] = tmp_df['alen']/(tmp_df['alen']-tmp_df['nmatch'])
    sub_df = tmp_df[tmp_df['cscore'] == tmp_df.groupby('qseqid')['cscore'].transform(max)].reset_index(drop=True)
    hits_per_target = sub_df.groupby('tname')['mquality'].count()
    hit_series = pd.Series(hits_per_target.tolist()/hits_per_target.sum(), hits_per_target.index)
    if tax_rank == 's' or tax_rank == 'species':
        new_new_hit_series = hit_series
        new_new_hit_series.sort_values(ascending=False, inplace=True)
    else:
        # The accumulators hold fractions, so their dtype is stated explicitly instead of
        # relying on the default dtype of an empty Series, which differs between pandas versions.
        new_hit_series = pd.Series(dtype='float64')
        new_new_hit_series = pd.Series(dtype='float64')
        for index in hit_series.index.unique():
            new_hit_series.at[index] = np.sum(hit_series[index])
        # relabel the target names with their name at the requested rank
        new_hit_series = new_hit_series.rename(tax_rank_file, axis='index')
        # several targets can now share one label: add their fractions up
        for index in new_hit_series.index.unique():
            new_new_hit_series.at[index] = np.sum(new_hit_series[index])
        new_new_hit_series.sort_values(ascending=False, inplace=True)
    print(sub_df.qseqid.unique().shape == tmp_df.qseqid.unique().shape)
    print('##########\n')
    print(F"This was the sample type expected: {tax_rank_file[expected_species[species]]}\n")
    print(F"These are the results:")
    print(new_new_hit_series,'\n')
    new_new_hit_series.to_json('/media/MassStorage/tmp/TE/honours/analysis/Mapping/mock/custom_results/%s_%s.json' % (tax_rank,expected_species[species]))

In [137]:
# One lookup table per rank: species name -> value of that rank column in mock_ref_df.
expected_rank_s = {}
expected_rank_g = {}
expected_rank_f = {}
expected_rank_o = {}
expected_rank_c = {}
expected_rank_p = {}

rank_columns = (
    (expected_rank_s, 'species'),
    (expected_rank_g, 'genus'),
    (expected_rank_f, 'family'),
    (expected_rank_o, 'order'),
    (expected_rank_c, 'class'),
    (expected_rank_p, 'phylum'),
)

for column, row in mock_ref_df.iterrows():
    for rank_table, rank_column in rank_columns:
        rank_table[row['species']] = row[rank_column]

In [138]:
### Species-level summary of the reads mapped against the full database (new_db_fn).
### `barcode` holds the species name that keys new_db_mapping_fn.
for barcode, hit_fn in new_db_mapping_fn.items():
    print(barcode)
    mapping_results_v2(hit_fn, barcode, expected_rank_s, 's', species_file)

aspergillus_flavus
True
##########

This was the sample type expected: Aspergillus_flavus

These are the results:
tname
aspergillus_flavus    1.0
dtype: float64 

aspergillus_niger
True
##########

This was the sample type expected: Aspergillus_niger

These are the results:
tname
aspergillus_niger            0.763547
aspergillus_unidentified     0.211823
yamadazyma_mexicana          0.004926
dothiorella_vidmadera        0.004926
debaryomyces_unidentified    0.004926
candida_parapsilosis         0.004926
aspergillus_flavus           0.004926
dtype: float64 

candida_albicans
True
##########

This was the sample type expected: Candida_albicans

These are the results:
tname
candida_unidentified            0.450980
candida_albicans                0.377451
candida_orthopsilosis           0.088235
candida_metapsilosis            0.034314
candida_parapsilosis            0.019608
tuber_brumale                   0.004902
pyrenophora_tritici-repentis    0.004902
meyerozyma_guilliermondii       0

In [139]:
### Genus-level summary of the reads mapped against the full database (new_db_fn).
### `barcode` holds the species name that keys new_db_mapping_fn.
for barcode, hit_fn in new_db_mapping_fn.items():
    mapping_results_v2(hit_fn, barcode, expected_rank_s, 'g', genus_file)

True
##########

This was the sample type expected: Aspergillus

These are the results:
Aspergillus    1.0
dtype: float64 

True
##########

This was the sample type expected: Aspergillus

These are the results:
Aspergillus     0.980296
Yamadazyma      0.004926
Dothiorella     0.004926
Debaryomyces    0.004926
Candida         0.004926
dtype: float64 

True
##########

This was the sample type expected: Candida

These are the results:
Candida          0.970588
Tuber            0.004902
Pyrenophora      0.004902
Meyerozyma       0.004902
Kluyveromyces    0.004902
Galactomyces     0.004902
Diaporthe        0.004902
dtype: float64 

True
##########

This was the sample type expected: Candida

These are the results:
Candida      0.965854
Kodamaea     0.019512
Diaporthe    0.009756
Fusarium     0.004878
dtype: float64 

True
##########

This was the sample type expected: Candida

These are the results:
Candida        0.965347
Meyerozyma     0.019802
Oculimacula    0.009901
Entoleuca      0.0

In [140]:
### Family-level summary of the reads mapped against the full database (new_db_fn).
### `barcode` holds the species name that keys new_db_mapping_fn.
for barcode, hit_fn in new_db_mapping_fn.items():
    mapping_results_v2(hit_fn, barcode, expected_rank_s, 'f', family_file)

True
##########

This was the sample type expected: Aspergillaceae

These are the results:
Aspergillaceae    1.0
dtype: float64 

True
##########

This was the sample type expected: Aspergillaceae

These are the results:
Aspergillaceae        0.980296
Debaryomycetaceae     0.009852
Botryosphaeriaceae    0.004926
Saccharomycetaceae    0.004926
dtype: float64 

True
##########

This was the sample type expected: Saccharomycetaceae

These are the results:
Saccharomycetaceae    0.975490
Tuberaceae            0.004902
Pleosporaceae         0.004902
Debaryomycetaceae     0.004902
Dipodascaceae         0.004902
Diaporthaceae         0.004902
dtype: float64 

True
##########

This was the sample type expected: Saccharomycetaceae

These are the results:
Saccharomycetaceae    0.965854
Metschnikowiaceae     0.019512
Diaporthaceae         0.009756
Nectriaceae           0.004878
dtype: float64 

True
##########

This was the sample type expected: Saccharomycetaceae

These are the results:
Saccharom

In [141]:
### Order-level summary of the reads mapped against the full database (new_db_fn).
### `barcode` holds the species name that keys new_db_mapping_fn.
for barcode, hit_fn in new_db_mapping_fn.items():
    mapping_results_v2(hit_fn, barcode, expected_rank_s, 'o', order_file)

True
##########

This was the sample type expected: Eurotiales

These are the results:
Eurotiales    1.0
dtype: float64 

True
##########

This was the sample type expected: Eurotiales

These are the results:
Eurotiales           0.980296
Saccharomycetales    0.014778
Botryosphaeriales    0.004926
dtype: float64 

True
##########

This was the sample type expected: Saccharomycetales

These are the results:
Saccharomycetales    0.985294
Pezizales            0.004902
Pleosporales         0.004902
Diaporthales         0.004902
dtype: float64 

True
##########

This was the sample type expected: Saccharomycetales

These are the results:
Saccharomycetales    0.985366
Diaporthales         0.009756
Hypocreales          0.004878
dtype: float64 

True
##########

This was the sample type expected: Saccharomycetales

These are the results:
Saccharomycetales    0.985149
Helotiales           0.009901
Xylariales           0.004950
dtype: float64 

True
##########

This was the sample type expected:

In [142]:
### Class-level summary of the reads mapped against the full database (new_db_fn).
### `barcode` holds the species name that keys new_db_mapping_fn.
for barcode, hit_fn in new_db_mapping_fn.items():
    mapping_results_v2(hit_fn, barcode, expected_rank_s, 'c', class_file)

True
##########

This was the sample type expected: Eurotiomycetes

These are the results:
Eurotiomycetes    1.0
dtype: float64 

True
##########

This was the sample type expected: Eurotiomycetes

These are the results:
Eurotiomycetes     0.980296
Saccharomycetes    0.014778
Dothideomycetes    0.004926
dtype: float64 

True
##########

This was the sample type expected: Saccharomycetes

These are the results:
Saccharomycetes    0.985294
Pezizomycetes      0.004902
Dothideomycetes    0.004902
Sordariomycetes    0.004902
dtype: float64 

True
##########

This was the sample type expected: Saccharomycetes

These are the results:
Saccharomycetes    0.985366
Sordariomycetes    0.014634
dtype: float64 

True
##########

This was the sample type expected: Saccharomycetes

These are the results:
Saccharomycetes    0.985149
Leotiomycetes      0.009901
Sordariomycetes    0.004950
dtype: float64 

True
##########

This was the sample type expected: Sordariomycetes

These are the results:
Sordari

In [143]:
### Phylum-level summary of the reads mapped against the full database (new_db_fn).
### `barcode` holds the species name that keys new_db_mapping_fn.
for barcode, hit_fn in new_db_mapping_fn.items():
    mapping_results_v2(hit_fn, barcode, expected_rank_s, 'p', phylum_file)

True
##########

This was the sample type expected: Ascomycota

These are the results:
Ascomycota    1.0
dtype: float64 

True
##########

This was the sample type expected: Ascomycota

These are the results:
Ascomycota    1.0
dtype: float64 

True
##########

This was the sample type expected: Ascomycota

These are the results:
Ascomycota    1.0
dtype: float64 

True
##########

This was the sample type expected: Ascomycota

These are the results:
Ascomycota    1.0
dtype: float64 

True
##########

This was the sample type expected: Ascomycota

These are the results:
Ascomycota    1.0
dtype: float64 

True
##########

This was the sample type expected: Ascomycota

These are the results:
Ascomycota    1.0
dtype: float64 

True
##########

This was the sample type expected: Ascomycota

These are the results:
Ascomycota    1.0
dtype: float64 

True
##########

This was the sample type expected: Basidiomycota

These are the results:
Basidiomycota    0.995
Ascomycota       0.005
dtype: flo