In [1]:
# Standard library, dataframe handling, and NCBI taxonomy lookups.
import os
import pandas as pd
from ete3 import NCBITaxa 
# Shared taxonomy handle used by the rank-lookup helper defined below.
# NOTE(review): ete3 presumably reads (and on first use builds) a local copy of
# the NCBI taxonomy here, so results depend on that database version -- confirm.
ncbi = NCBITaxa()

In [2]:
def search_rank_output_name_append_column(df, staxid_column, rank_search):
    """Add a column to ``df`` holding each read's taxon name at one rank.

    For every entry of ``staxid_column`` the NCBI lineage is looked up through
    the module-level ``ncbi`` handle and the name of the ancestor whose rank
    equals ``rank_search`` is recorded. The names are written to
    ``df[rank_search]`` in place, in the same row order as ``staxid_column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that receives the new column; modified in place.
    staxid_column : pandas.Series or sequence
        One taxid per row of ``df``. An entry may be a ';'-separated string of
        several taxids, in which case only the first one is used.
    rank_search : str
        Taxonomic rank to report (e.g. 'genus'); also the new column's name.

    Returns
    -------
    None

    Notes
    -----
    'Unclassified' is stored when the lineage has no entry at ``rank_search``
    and also when the taxid has no lineage in the taxonomy database at all
    (previously a single unknown taxid aborted the whole run).
    """
    # Many reads share a taxid, so each distinct taxid is resolved only once.
    name_by_taxid = {}
    rank_names = []
    # Iterate over the values rather than over index labels 0..n-1, so a
    # Series whose index is not a fresh RangeIndex is still read in row order.
    for raw_taxid in staxid_column:
        if ';' in str(raw_taxid):
            taxid = raw_taxid.split(';')[0]
        else:
            taxid = raw_taxid

        cache_key = str(taxid)
        if cache_key not in name_by_taxid:
            name_by_taxid[cache_key] = _rank_name_for_taxid(taxid, rank_search)
        rank_names.append(name_by_taxid[cache_key])
    df[rank_search] = rank_names


def _rank_name_for_taxid(taxid, rank_search):
    """Return the name at ``rank_search`` in ``taxid``'s lineage, else 'Unclassified'."""
    try:
        taxid_lineage = ncbi.get_lineage(taxid)
    except ValueError:
        # ete3 reports a taxid that is missing from its database this way.
        return 'Unclassified'
    if not taxid_lineage:
        # No lineage was returned, so there is nothing to translate.
        return 'Unclassified'

    names = ncbi.get_taxid_translator(taxid_lineage)  # taxid -> name
    ranks = ncbi.get_rank(taxid_lineage)  # taxid -> rank
    # Re-key the lineage by rank so the requested rank can be looked up directly.
    ranks2names = {ranks[k]: names[k] for k in names.keys() & ranks}
    return ranks2names.get(rank_search, 'Unclassified')
    
def trim_df_columns(df,column_list):
    """Remove the column label(s) in ``column_list`` from ``df`` in place.

    ``column_list`` may be a single label or a list of labels. Nothing is
    returned; the caller's frame itself loses the columns.
    """
    df.drop(labels=column_list, axis=1, inplace=True)
    
def unique_df(df, excluded_taxids=(0, 2726947)):
    """Return a re-indexed copy of ``df`` without rows of excluded taxids.

    Despite the name, no de-duplication happens: rows are only filtered on
    their ``taxid`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``taxid`` column; it is not modified.
    excluded_taxids : iterable, optional
        Taxids whose rows are dropped. The defaults reproduce the original
        hard-coded behaviour:
        * 0 -- NOTE(review): presumably Kraken's marker for reads it could
          not classify; confirm against the Kraken output format.
        * 2726947 -- a species absent from the ete3 taxonomy database with
          only a few hits, so it was simply removed from the input.

    Returns
    -------
    pandas.DataFrame
        The remaining rows with a fresh 0..n-1 index.
    """
    keep = ~df['taxid'].isin(list(excluded_taxids))
    return df[keep].reset_index(drop=True)

In [3]:
# Global configuration.
# Directory holding one Kraken output file per sequenced sample (listed in the
# next cell). NOTE(review): this is an absolute path on the author's machine;
# adjust it before running elsewhere.
sourcedir = '/home/yiheng/MinION_data/tavish_data/tavish_kraken_output' # the directory where all the documents of each sequencing run are stored.

In [4]:
# Show the files currently present in the source directory.
os.listdir(sourcedir)

['puccinia_striiformis.1000.ncbiITSkraken_output',
 'candida_orthopsilosis.1000.ncbiITSkraken_output',
 'candida_albicans.1000.ncbiITSkraken_output',
 'aspergillus_niger.1000.ncbiITSkraken_output',
 'candida_metapsilosis.1000.ncbiITSkraken_output',
 'diaporthe_ccl067.1000.ncbiITSkraken_output',
 'aspergillus_flavus.1000.ncbiITSkraken_output',
 'pyrenophora_tritici-repentis.1000.ncbiITSkraken_output',
 'wheat_zymoseptoria_tritici.1000.ncbiITSkraken_output',
 'wheat_pyrenophora_tritici-repentis.1000.ncbiITSkraken_output',
 'diaporthe_unidentified.1000.ncbiITSkraken_output',
 'scedosporium_boydii.1000.ncbiITSkraken_output',
 'saccharomyces_cerevisiae.1000.ncbiITSkraken_output',
 'zymoseptoria_tritici.1000.ncbiITSkraken_output',
 'rhodotorula_mucilaginosa.1000.ncbiITSkraken_output',
 'wheat_puccinia_striiformis.1000.ncbiITSkraken_output']

In [5]:
# Sample names are the part of each file name before the first '.'.
# Only Kraken output files are considered: the summary table saved at the end
# of this notebook is written into the same directory and would otherwise show
# up as a bogus extra "sample" on a re-run, breaking the length match with the
# per-sample accuracy lists.
# NOTE: os.listdir returns entries in arbitrary order, while the later cells
# pair these names with a hand-written sample order -- check that
# `species_list` lines up with `df_list` before trusting the final table.
species_list = [x.split('.')[0] for x in os.listdir(sourcedir) if x.endswith('kraken_output')]

In [6]:
# File names of the Kraken results, one per sample, in the fixed sample order
# that the rest of the notebook relies on.
kraken_output_files = [
    'puccinia_striiformis.1000.ncbiITSkraken_output',
    'candida_orthopsilosis.1000.ncbiITSkraken_output',
    'candida_albicans.1000.ncbiITSkraken_output',
    'aspergillus_niger.1000.ncbiITSkraken_output',
    'candida_metapsilosis.1000.ncbiITSkraken_output',
    'diaporthe_ccl067.1000.ncbiITSkraken_output',
    'aspergillus_flavus.1000.ncbiITSkraken_output',
    'pyrenophora_tritici-repentis.1000.ncbiITSkraken_output',
    'wheat_zymoseptoria_tritici.1000.ncbiITSkraken_output',
    'wheat_pyrenophora_tritici-repentis.1000.ncbiITSkraken_output',
    'diaporthe_unidentified.1000.ncbiITSkraken_output',
    'scedosporium_boydii.1000.ncbiITSkraken_output',
    'saccharomyces_cerevisiae.1000.ncbiITSkraken_output',
    'zymoseptoria_tritici.1000.ncbiITSkraken_output',
    'rhodotorula_mucilaginosa.1000.ncbiITSkraken_output',
    'wheat_puccinia_striiformis.1000.ncbiITSkraken_output',
]

# Despite the "_dir" suffix, every variable below is the full path of one file.
(
    puccinia_striiformis_dir,
    candida_orthopsilosis_dir,
    candida_albicans_dir,
    aspergillus_niger_dir,
    candida_metapsilosis_dir,
    diaporthe_ccl067_dir,
    aspergillus_flavus_dir,
    pyrenophora_tritici_dir,
    wheat_zymoseptoria_tritici_dir,
    wheat_pyrenophora_tritici_dir,
    diaporthe_unidentified_dir,
    scedosporium_boydii_dir,
    saccharomyces_cerevisiae_dir,
    zymoseptoria_tritici_dir,
    rhodotorula_mucilaginosa_dir,
    wheat_puccinia_striiformis_dir,
) = [os.path.join(sourcedir, file_name) for file_name in kraken_output_files]


In [7]:
# Column names given to the four Kraken output fields that are kept after the
# extra column is dropped in a later cell.
kraken_headers = ['classification', 'readid', 'taxid', 'seqlen']

In [8]:
def _read_kraken_output(path):
    """Load one headerless, tab-separated Kraken output file into a DataFrame."""
    return pd.read_csv(path, sep='\t', header=None)


# One raw frame per sample; columns are still numbered 0..n at this point.
puccinia_striiformis_df = _read_kraken_output(puccinia_striiformis_dir)
candida_orthopsilosis_df = _read_kraken_output(candida_orthopsilosis_dir)
candida_albicans_df = _read_kraken_output(candida_albicans_dir)
aspergillus_niger_df = _read_kraken_output(aspergillus_niger_dir)
candida_metapsilosis_df = _read_kraken_output(candida_metapsilosis_dir)
diaporthe_ccl067_df = _read_kraken_output(diaporthe_ccl067_dir)
aspergillus_flavus_df = _read_kraken_output(aspergillus_flavus_dir)
pyrenophora_tritici_df = _read_kraken_output(pyrenophora_tritici_dir)
wheat_zymoseptoria_tritici_df = _read_kraken_output(wheat_zymoseptoria_tritici_dir)
wheat_pyrenophora_tritici_df = _read_kraken_output(wheat_pyrenophora_tritici_dir)
diaporthe_unidentified_df = _read_kraken_output(diaporthe_unidentified_dir)
scedosporium_boydii_df = _read_kraken_output(scedosporium_boydii_dir)
saccharomyces_cerevisiae_df = _read_kraken_output(saccharomyces_cerevisiae_dir)
zymoseptoria_tritici_df = _read_kraken_output(zymoseptoria_tritici_dir)
rhodotorula_mucilaginosa_df = _read_kraken_output(rhodotorula_mucilaginosa_dir)
wheat_puccinia_striiformis_df = _read_kraken_output(wheat_puccinia_striiformis_dir)

In [9]:
# All raw sample frames, in the fixed sample order used throughout the notebook.
df_list = [
    puccinia_striiformis_df,
    candida_orthopsilosis_df,
    candida_albicans_df,
    aspergillus_niger_df,
    candida_metapsilosis_df,
    diaporthe_ccl067_df,
    aspergillus_flavus_df,
    pyrenophora_tritici_df,
    wheat_zymoseptoria_tritici_df,
    wheat_pyrenophora_tritici_df,
    diaporthe_unidentified_df,
    scedosporium_boydii_df,
    saccharomyces_cerevisiae_df,
    zymoseptoria_tritici_df,
    rhodotorula_mucilaginosa_df,
    wheat_puccinia_striiformis_df,
]

In [10]:
# Drop the column labelled 4 from every raw frame and name the remaining four.
# The frames are modified in place, so the named *_df variables change too.
for df in df_list:
    # Column 4 only exists before this cell has run once. Guarding the drop
    # keeps the cell re-runnable: an unconditional drop raises KeyError once
    # the columns have been renamed.
    if 4 in df.columns:
        trim_df_columns(df, 4)
    df.columns = kraken_headers

In [11]:
# Filtered copy of every sample frame (rows with excluded taxids removed by
# unique_df), produced in the fixed sample order.
(
    puccinia_striiformis_uniq_df,
    candida_orthopsilosis_uniq_df,
    candida_albicans_uniq_df,
    aspergillus_niger_uniq_df,
    candida_metapsilosis_uniq_df,
    diaporthe_ccl067_uniq_df,
    aspergillus_flavus_uniq_df,
    pyrenophora_tritici_uniq_df,
    wheat_zymoseptoria_tritici_uniq_df,
    wheat_pyrenophora_tritici_uniq_df,
    diaporthe_unidentified_uniq_df,
    scedosporium_boydii_uniq_df,
    saccharomyces_cerevisiae_uniq_df,
    zymoseptoria_tritici_uniq_df,
    rhodotorula_mucilaginosa_uniq_df,
    wheat_puccinia_striiformis_uniq_df,
) = [
    unique_df(sample_df)
    for sample_df in (
        puccinia_striiformis_df,
        candida_orthopsilosis_df,
        candida_albicans_df,
        aspergillus_niger_df,
        candida_metapsilosis_df,
        diaporthe_ccl067_df,
        aspergillus_flavus_df,
        pyrenophora_tritici_df,
        wheat_zymoseptoria_tritici_df,
        wheat_pyrenophora_tritici_df,
        diaporthe_unidentified_df,
        scedosporium_boydii_df,
        saccharomyces_cerevisiae_df,
        zymoseptoria_tritici_df,
        rhodotorula_mucilaginosa_df,
        wheat_puccinia_striiformis_df,
    )
]

In [12]:
# Ranks to annotate; each becomes a column of the same name in every frame.
rank_list = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']

# Filtered sample frames, in the fixed sample order.
frames_to_annotate = (
    puccinia_striiformis_uniq_df,
    candida_orthopsilosis_uniq_df,
    candida_albicans_uniq_df,
    aspergillus_niger_uniq_df,
    candida_metapsilosis_uniq_df,
    diaporthe_ccl067_uniq_df,
    aspergillus_flavus_uniq_df,
    pyrenophora_tritici_uniq_df,
    wheat_zymoseptoria_tritici_uniq_df,
    wheat_pyrenophora_tritici_uniq_df,
    diaporthe_unidentified_uniq_df,
    scedosporium_boydii_uniq_df,
    saccharomyces_cerevisiae_uniq_df,
    zymoseptoria_tritici_uniq_df,
    rhodotorula_mucilaginosa_uniq_df,
    wheat_puccinia_striiformis_uniq_df,
)

# Rank by rank, add the taxon name of every read to each frame (in place).
for rank in rank_list:
    for uniq_frame in frames_to_annotate:
        search_rank_output_name_append_column(uniq_frame, uniq_frame.taxid, rank)

In [13]:
# Result table: one row per sample; accuracy columns are added rank by rank.
final_df = pd.DataFrame({'samples': species_list})

In [14]:
# Species-level hits: for every sample keep the reads whose 'species' column
# contains the expected species name (substring match via str.contains).
puccinia_striiformis_uniq_df_trim = puccinia_striiformis_uniq_df[puccinia_striiformis_uniq_df.species.str.contains('Puccinia striiformis')]
candida_orthopsilosis_uniq_df_trim = candida_orthopsilosis_uniq_df[candida_orthopsilosis_uniq_df.species.str.contains('Candida orthopsilosis')]
candida_albicans_uniq_df_trim = candida_albicans_uniq_df[candida_albicans_uniq_df.species.str.contains('Candida albicans')]
aspergillus_niger_uniq_df_trim = aspergillus_niger_uniq_df[aspergillus_niger_uniq_df.species.str.contains('Aspergillus niger')]
candida_metapsilosis_uniq_df_trim = candida_metapsilosis_uniq_df[candida_metapsilosis_uniq_df.species.str.contains('Candida metapsilosis')]
diaporthe_ccl067_uniq_df_trim = diaporthe_ccl067_uniq_df[diaporthe_ccl067_uniq_df.species.str.contains('Diaporthe ccl067')]
aspergillus_flavus_uniq_df_trim = aspergillus_flavus_uniq_df[aspergillus_flavus_uniq_df.species.str.contains('Aspergillus flavus')]
pyrenophora_tritici_uniq_df_trim = pyrenophora_tritici_uniq_df[pyrenophora_tritici_uniq_df.species.str.contains('Pyrenophora tritici')]
wheat_zymoseptoria_tritici_uniq_df_trim = wheat_zymoseptoria_tritici_uniq_df[wheat_zymoseptoria_tritici_uniq_df.species.str.contains('Zymoseptoria tritici')]
wheat_pyrenophora_tritici_uniq_df_trim = wheat_pyrenophora_tritici_uniq_df[wheat_pyrenophora_tritici_uniq_df.species.str.contains('Pyrenophora tritici')]
# The unidentified Diaporthe isolate has no species name to match. A read is
# counted when it is placed in genus Diaporthe and no species was assigned.
# The rank-lookup helper writes 'Unclassified' for a missing rank; the former
# 'Unidentified' pattern never matched that marker.
# NOTE(review): confirm this is the intended definition of a species-level hit.
diaporthe_unidentified_uniq_df_trim = diaporthe_unidentified_uniq_df[diaporthe_unidentified_uniq_df.genus.str.contains('Diaporthe') & diaporthe_unidentified_uniq_df.species.str.contains('Unclassified')]
scedosporium_boydii_uniq_df_trim = scedosporium_boydii_uniq_df[scedosporium_boydii_uniq_df.species.str.contains('Scedosporium boydii')]
saccharomyces_cerevisiae_uniq_df_trim = saccharomyces_cerevisiae_uniq_df[saccharomyces_cerevisiae_uniq_df.species.str.contains('Saccharomyces cerevisiae')]
zymoseptoria_tritici_uniq_df_trim = zymoseptoria_tritici_uniq_df[zymoseptoria_tritici_uniq_df.species.str.contains('Zymoseptoria tritici')]
rhodotorula_mucilaginosa_uniq_df_trim = rhodotorula_mucilaginosa_uniq_df[rhodotorula_mucilaginosa_uniq_df.species.str.contains('Rhodotorula mucilaginosa')]
wheat_puccinia_striiformis_uniq_df_trim = wheat_puccinia_striiformis_uniq_df[wheat_puccinia_striiformis_uniq_df.species.str.contains('Puccinia striiformis')]

In [15]:
# Species-level matches of every sample, in the fixed sample order.
df_uniq_trim_list = [
    puccinia_striiformis_uniq_df_trim,
    candida_orthopsilosis_uniq_df_trim,
    candida_albicans_uniq_df_trim,
    aspergillus_niger_uniq_df_trim,
    candida_metapsilosis_uniq_df_trim,
    diaporthe_ccl067_uniq_df_trim,
    aspergillus_flavus_uniq_df_trim,
    pyrenophora_tritici_uniq_df_trim,
    wheat_zymoseptoria_tritici_uniq_df_trim,
    wheat_pyrenophora_tritici_uniq_df_trim,
    diaporthe_unidentified_uniq_df_trim,
    scedosporium_boydii_uniq_df_trim,
    saccharomyces_cerevisiae_uniq_df_trim,
    zymoseptoria_tritici_uniq_df_trim,
    rhodotorula_mucilaginosa_uniq_df_trim,
    wheat_puccinia_striiformis_uniq_df_trim,
]

In [16]:
# Share of matching reads per sample.
# NOTE(review): 1000 is presumably the number of reads per sample (the input
# file names carry '.1000.') -- confirm. Reads removed earlier by unique_df
# still count in this fixed denominator.
species_frequency = []
for matched_reads in df_uniq_trim_list:
    species_frequency.append(len(matched_reads) / 1000)

In [17]:
# Store the species-level fractions; row order must match `samples`.
final_df['species_accuracy'] = species_frequency

In [18]:
# Expected genus for every sample, in the fixed sample order.
expected_genus = [
    (puccinia_striiformis_uniq_df, 'Puccinia'),
    (candida_orthopsilosis_uniq_df, 'Candida'),
    (candida_albicans_uniq_df, 'Candida'),
    (aspergillus_niger_uniq_df, 'Aspergillus'),
    (candida_metapsilosis_uniq_df, 'Candida'),
    (diaporthe_ccl067_uniq_df, 'Diaporthe'),
    (aspergillus_flavus_uniq_df, 'Aspergillus'),
    (pyrenophora_tritici_uniq_df, 'Pyrenophora'),
    (wheat_zymoseptoria_tritici_uniq_df, 'Zymoseptoria'),
    (wheat_pyrenophora_tritici_uniq_df, 'Pyrenophora'),
    (diaporthe_unidentified_uniq_df, 'Diaporthe'),
    (scedosporium_boydii_uniq_df, 'Scedosporium'),
    (saccharomyces_cerevisiae_uniq_df, 'Saccharomyces'),
    (zymoseptoria_tritici_uniq_df, 'Zymoseptoria'),
    (rhodotorula_mucilaginosa_uniq_df, 'Rhodotorula'),
    (wheat_puccinia_striiformis_uniq_df, 'Puccinia'),
]

# Genus-level hits: reads whose 'genus' column contains the expected name.
(
    puccinia_striiformis_uniq_df_trim,
    candida_orthopsilosis_uniq_df_trim,
    candida_albicans_uniq_df_trim,
    aspergillus_niger_uniq_df_trim,
    candida_metapsilosis_uniq_df_trim,
    diaporthe_ccl067_uniq_df_trim,
    aspergillus_flavus_uniq_df_trim,
    pyrenophora_tritici_uniq_df_trim,
    wheat_zymoseptoria_tritici_uniq_df_trim,
    wheat_pyrenophora_tritici_uniq_df_trim,
    diaporthe_unidentified_uniq_df_trim,
    scedosporium_boydii_uniq_df_trim,
    saccharomyces_cerevisiae_uniq_df_trim,
    zymoseptoria_tritici_uniq_df_trim,
    rhodotorula_mucilaginosa_uniq_df_trim,
    wheat_puccinia_striiformis_uniq_df_trim,
) = [
    frame[frame['genus'].str.contains(taxon_name)]
    for frame, taxon_name in expected_genus
]

In [19]:
# Genus-level matches of every sample, in the fixed sample order.
df_uniq_trim_list = [
    puccinia_striiformis_uniq_df_trim,
    candida_orthopsilosis_uniq_df_trim,
    candida_albicans_uniq_df_trim,
    aspergillus_niger_uniq_df_trim,
    candida_metapsilosis_uniq_df_trim,
    diaporthe_ccl067_uniq_df_trim,
    aspergillus_flavus_uniq_df_trim,
    pyrenophora_tritici_uniq_df_trim,
    wheat_zymoseptoria_tritici_uniq_df_trim,
    wheat_pyrenophora_tritici_uniq_df_trim,
    diaporthe_unidentified_uniq_df_trim,
    scedosporium_boydii_uniq_df_trim,
    saccharomyces_cerevisiae_uniq_df_trim,
    zymoseptoria_tritici_uniq_df_trim,
    rhodotorula_mucilaginosa_uniq_df_trim,
    wheat_puccinia_striiformis_uniq_df_trim,
]

In [20]:
# Share of genus-level matches per sample.
# NOTE(review): 1000 is presumably the number of reads per sample (the input
# file names carry '.1000.') -- confirm.
genus_frequency = []
for matched_reads in df_uniq_trim_list:
    genus_frequency.append(len(matched_reads) / 1000)

In [21]:
# Store the genus-level fractions; row order must match `samples`.
final_df['genus_accuracy'] = genus_frequency

In [22]:
# Expected family for every sample, in the fixed sample order.
expected_family = [
    (puccinia_striiformis_uniq_df, 'Pucciniaceae'),
    (candida_orthopsilosis_uniq_df, 'Debaryomycetaceae'),
    (candida_albicans_uniq_df, 'Debaryomycetaceae'),
    (aspergillus_niger_uniq_df, 'Aspergillaceae'),
    (candida_metapsilosis_uniq_df, 'Debaryomycetaceae'),
    (diaporthe_ccl067_uniq_df, 'Diaporthaceae'),
    (aspergillus_flavus_uniq_df, 'Aspergillaceae'),
    (pyrenophora_tritici_uniq_df, 'Pleosporaceae'),
    (wheat_zymoseptoria_tritici_uniq_df, 'Mycosphaerellaceae'),
    (wheat_pyrenophora_tritici_uniq_df, 'Pleosporaceae'),
    (diaporthe_unidentified_uniq_df, 'Diaporthaceae'),
    (scedosporium_boydii_uniq_df, 'Microascaceae'),
    (saccharomyces_cerevisiae_uniq_df, 'Saccharomycetaceae'),
    (zymoseptoria_tritici_uniq_df, 'Mycosphaerellaceae'),
    (rhodotorula_mucilaginosa_uniq_df, 'Sporidiobolaceae'),
    (wheat_puccinia_striiformis_uniq_df, 'Pucciniaceae'),
]

# Family-level hits: reads whose 'family' column contains the expected name.
(
    puccinia_striiformis_uniq_df_trim,
    candida_orthopsilosis_uniq_df_trim,
    candida_albicans_uniq_df_trim,
    aspergillus_niger_uniq_df_trim,
    candida_metapsilosis_uniq_df_trim,
    diaporthe_ccl067_uniq_df_trim,
    aspergillus_flavus_uniq_df_trim,
    pyrenophora_tritici_uniq_df_trim,
    wheat_zymoseptoria_tritici_uniq_df_trim,
    wheat_pyrenophora_tritici_uniq_df_trim,
    diaporthe_unidentified_uniq_df_trim,
    scedosporium_boydii_uniq_df_trim,
    saccharomyces_cerevisiae_uniq_df_trim,
    zymoseptoria_tritici_uniq_df_trim,
    rhodotorula_mucilaginosa_uniq_df_trim,
    wheat_puccinia_striiformis_uniq_df_trim,
) = [
    frame[frame['family'].str.contains(taxon_name)]
    for frame, taxon_name in expected_family
]

In [23]:
# Family-level matches of every sample, in the fixed sample order.
df_uniq_trim_list = [
    puccinia_striiformis_uniq_df_trim,
    candida_orthopsilosis_uniq_df_trim,
    candida_albicans_uniq_df_trim,
    aspergillus_niger_uniq_df_trim,
    candida_metapsilosis_uniq_df_trim,
    diaporthe_ccl067_uniq_df_trim,
    aspergillus_flavus_uniq_df_trim,
    pyrenophora_tritici_uniq_df_trim,
    wheat_zymoseptoria_tritici_uniq_df_trim,
    wheat_pyrenophora_tritici_uniq_df_trim,
    diaporthe_unidentified_uniq_df_trim,
    scedosporium_boydii_uniq_df_trim,
    saccharomyces_cerevisiae_uniq_df_trim,
    zymoseptoria_tritici_uniq_df_trim,
    rhodotorula_mucilaginosa_uniq_df_trim,
    wheat_puccinia_striiformis_uniq_df_trim,
]

In [24]:
# Share of family-level matches per sample.
# NOTE(review): 1000 is presumably the number of reads per sample (the input
# file names carry '.1000.') -- confirm.
family_frequency = []
for matched_reads in df_uniq_trim_list:
    family_frequency.append(len(matched_reads) / 1000)

In [25]:
# Store the family-level fractions; row order must match `samples`.
final_df['family_accuracy'] = family_frequency

In [26]:
# Expected order for every sample, in the fixed sample order.
expected_order = [
    (puccinia_striiformis_uniq_df, 'Pucciniales'),
    (candida_orthopsilosis_uniq_df, 'Saccharomycetales'),
    (candida_albicans_uniq_df, 'Saccharomycetales'),
    (aspergillus_niger_uniq_df, 'Eurotiales'),
    (candida_metapsilosis_uniq_df, 'Saccharomycetales'),
    (diaporthe_ccl067_uniq_df, 'Diaporthales'),
    (aspergillus_flavus_uniq_df, 'Eurotiales'),
    (pyrenophora_tritici_uniq_df, 'Pleosporales'),
    (wheat_zymoseptoria_tritici_uniq_df, 'Capnodiales'),
    (wheat_pyrenophora_tritici_uniq_df, 'Pleosporales'),
    (diaporthe_unidentified_uniq_df, 'Diaporthales'),
    (scedosporium_boydii_uniq_df, 'Microascales'),
    (saccharomyces_cerevisiae_uniq_df, 'Saccharomycetales'),
    (zymoseptoria_tritici_uniq_df, 'Capnodiales'),
    (rhodotorula_mucilaginosa_uniq_df, 'Sporidiobolales'),
    (wheat_puccinia_striiformis_uniq_df, 'Pucciniales'),
]

# Order-level hits: reads whose 'order' column contains the expected name.
(
    puccinia_striiformis_uniq_df_trim,
    candida_orthopsilosis_uniq_df_trim,
    candida_albicans_uniq_df_trim,
    aspergillus_niger_uniq_df_trim,
    candida_metapsilosis_uniq_df_trim,
    diaporthe_ccl067_uniq_df_trim,
    aspergillus_flavus_uniq_df_trim,
    pyrenophora_tritici_uniq_df_trim,
    wheat_zymoseptoria_tritici_uniq_df_trim,
    wheat_pyrenophora_tritici_uniq_df_trim,
    diaporthe_unidentified_uniq_df_trim,
    scedosporium_boydii_uniq_df_trim,
    saccharomyces_cerevisiae_uniq_df_trim,
    zymoseptoria_tritici_uniq_df_trim,
    rhodotorula_mucilaginosa_uniq_df_trim,
    wheat_puccinia_striiformis_uniq_df_trim,
) = [
    frame[frame['order'].str.contains(taxon_name)]
    for frame, taxon_name in expected_order
]

In [27]:
# Order-level matches of every sample, in the fixed sample order.
df_uniq_trim_list = [
    puccinia_striiformis_uniq_df_trim,
    candida_orthopsilosis_uniq_df_trim,
    candida_albicans_uniq_df_trim,
    aspergillus_niger_uniq_df_trim,
    candida_metapsilosis_uniq_df_trim,
    diaporthe_ccl067_uniq_df_trim,
    aspergillus_flavus_uniq_df_trim,
    pyrenophora_tritici_uniq_df_trim,
    wheat_zymoseptoria_tritici_uniq_df_trim,
    wheat_pyrenophora_tritici_uniq_df_trim,
    diaporthe_unidentified_uniq_df_trim,
    scedosporium_boydii_uniq_df_trim,
    saccharomyces_cerevisiae_uniq_df_trim,
    zymoseptoria_tritici_uniq_df_trim,
    rhodotorula_mucilaginosa_uniq_df_trim,
    wheat_puccinia_striiformis_uniq_df_trim,
]

In [28]:
# Share of order-level matches per sample.
# NOTE(review): 1000 is presumably the number of reads per sample (the input
# file names carry '.1000.') -- confirm.
order_frequency = []
for matched_reads in df_uniq_trim_list:
    order_frequency.append(len(matched_reads) / 1000)

In [29]:
# Store the order-level fractions; row order must match `samples`.
final_df['order_accuracy'] = order_frequency

In [30]:
# Class-level hits: for every sample keep the reads whose 'class' column
# contains the expected class name. Bracket access is required here because
# `class` is a Python keyword.
puccinia_striiformis_uniq_df_trim = puccinia_striiformis_uniq_df[puccinia_striiformis_uniq_df['class'].str.contains('Pucciniomycetes')]
candida_orthopsilosis_uniq_df_trim = candida_orthopsilosis_uniq_df[candida_orthopsilosis_uniq_df['class'].str.contains('Saccharomycetes')]
candida_albicans_uniq_df_trim = candida_albicans_uniq_df[candida_albicans_uniq_df['class'].str.contains('Saccharomycetes')]
aspergillus_niger_uniq_df_trim = aspergillus_niger_uniq_df[aspergillus_niger_uniq_df['class'].str.contains('Eurotiomycetes')]
candida_metapsilosis_uniq_df_trim = candida_metapsilosis_uniq_df[candida_metapsilosis_uniq_df['class'].str.contains('Saccharomycetes')]
diaporthe_ccl067_uniq_df_trim = diaporthe_ccl067_uniq_df[diaporthe_ccl067_uniq_df['class'].str.contains('Sordariomycetes')]
aspergillus_flavus_uniq_df_trim = aspergillus_flavus_uniq_df[aspergillus_flavus_uniq_df['class'].str.contains('Eurotiomycetes')]
pyrenophora_tritici_uniq_df_trim = pyrenophora_tritici_uniq_df[pyrenophora_tritici_uniq_df['class'].str.contains('Dothideomycetes')]
wheat_zymoseptoria_tritici_uniq_df_trim = wheat_zymoseptoria_tritici_uniq_df[wheat_zymoseptoria_tritici_uniq_df['class'].str.contains('Dothideomycetes')]
wheat_pyrenophora_tritici_uniq_df_trim = wheat_pyrenophora_tritici_uniq_df[wheat_pyrenophora_tritici_uniq_df['class'].str.contains('Dothideomycetes')]
diaporthe_unidentified_uniq_df_trim = diaporthe_unidentified_uniq_df[diaporthe_unidentified_uniq_df['class'].str.contains('Sordariomycetes')]
scedosporium_boydii_uniq_df_trim = scedosporium_boydii_uniq_df[scedosporium_boydii_uniq_df['class'].str.contains('Sordariomycetes')]
saccharomyces_cerevisiae_uniq_df_trim = saccharomyces_cerevisiae_uniq_df[saccharomyces_cerevisiae_uniq_df['class'].str.contains('Saccharomycetes')]
zymoseptoria_tritici_uniq_df_trim = zymoseptoria_tritici_uniq_df[zymoseptoria_tritici_uniq_df['class'].str.contains('Dothideomycetes')]
# Rhodotorula belongs to class Microbotryomycetes. 'Sporidiobolales' is its
# order and can never appear in the 'class' column, which made this sample's
# class-level count always zero.
rhodotorula_mucilaginosa_uniq_df_trim = rhodotorula_mucilaginosa_uniq_df[rhodotorula_mucilaginosa_uniq_df['class'].str.contains('Microbotryomycetes')]
wheat_puccinia_striiformis_uniq_df_trim = wheat_puccinia_striiformis_uniq_df[wheat_puccinia_striiformis_uniq_df['class'].str.contains('Pucciniomycetes')]

In [31]:
# Class-level matches of every sample, in the fixed sample order.
df_uniq_trim_list = [
    puccinia_striiformis_uniq_df_trim,
    candida_orthopsilosis_uniq_df_trim,
    candida_albicans_uniq_df_trim,
    aspergillus_niger_uniq_df_trim,
    candida_metapsilosis_uniq_df_trim,
    diaporthe_ccl067_uniq_df_trim,
    aspergillus_flavus_uniq_df_trim,
    pyrenophora_tritici_uniq_df_trim,
    wheat_zymoseptoria_tritici_uniq_df_trim,
    wheat_pyrenophora_tritici_uniq_df_trim,
    diaporthe_unidentified_uniq_df_trim,
    scedosporium_boydii_uniq_df_trim,
    saccharomyces_cerevisiae_uniq_df_trim,
    zymoseptoria_tritici_uniq_df_trim,
    rhodotorula_mucilaginosa_uniq_df_trim,
    wheat_puccinia_striiformis_uniq_df_trim,
]

In [32]:
# Share of class-level matches per sample.
# NOTE(review): 1000 is presumably the number of reads per sample (the input
# file names carry '.1000.') -- confirm.
class_frequency = []
for matched_reads in df_uniq_trim_list:
    class_frequency.append(len(matched_reads) / 1000)

In [33]:
# Store the class-level fractions; row order must match `samples`.
final_df['class_accuracy'] = class_frequency

In [34]:
# Expected phylum for every sample, in the fixed sample order.
expected_phylum = [
    (puccinia_striiformis_uniq_df, 'Basidiomycota'),
    (candida_orthopsilosis_uniq_df, 'Ascomycota'),
    (candida_albicans_uniq_df, 'Ascomycota'),
    (aspergillus_niger_uniq_df, 'Ascomycota'),
    (candida_metapsilosis_uniq_df, 'Ascomycota'),
    (diaporthe_ccl067_uniq_df, 'Ascomycota'),
    (aspergillus_flavus_uniq_df, 'Ascomycota'),
    (pyrenophora_tritici_uniq_df, 'Ascomycota'),
    (wheat_zymoseptoria_tritici_uniq_df, 'Ascomycota'),
    (wheat_pyrenophora_tritici_uniq_df, 'Ascomycota'),
    (diaporthe_unidentified_uniq_df, 'Ascomycota'),
    (scedosporium_boydii_uniq_df, 'Ascomycota'),
    (saccharomyces_cerevisiae_uniq_df, 'Ascomycota'),
    (zymoseptoria_tritici_uniq_df, 'Ascomycota'),
    (rhodotorula_mucilaginosa_uniq_df, 'Basidiomycota'),
    (wheat_puccinia_striiformis_uniq_df, 'Basidiomycota'),
]

# Phylum-level hits: reads whose 'phylum' column contains the expected name.
(
    puccinia_striiformis_uniq_df_trim,
    candida_orthopsilosis_uniq_df_trim,
    candida_albicans_uniq_df_trim,
    aspergillus_niger_uniq_df_trim,
    candida_metapsilosis_uniq_df_trim,
    diaporthe_ccl067_uniq_df_trim,
    aspergillus_flavus_uniq_df_trim,
    pyrenophora_tritici_uniq_df_trim,
    wheat_zymoseptoria_tritici_uniq_df_trim,
    wheat_pyrenophora_tritici_uniq_df_trim,
    diaporthe_unidentified_uniq_df_trim,
    scedosporium_boydii_uniq_df_trim,
    saccharomyces_cerevisiae_uniq_df_trim,
    zymoseptoria_tritici_uniq_df_trim,
    rhodotorula_mucilaginosa_uniq_df_trim,
    wheat_puccinia_striiformis_uniq_df_trim,
) = [
    frame[frame['phylum'].str.contains(taxon_name)]
    for frame, taxon_name in expected_phylum
]

In [35]:
# Phylum-level matches of every sample, in the fixed sample order.
df_uniq_trim_list = [
    puccinia_striiformis_uniq_df_trim,
    candida_orthopsilosis_uniq_df_trim,
    candida_albicans_uniq_df_trim,
    aspergillus_niger_uniq_df_trim,
    candida_metapsilosis_uniq_df_trim,
    diaporthe_ccl067_uniq_df_trim,
    aspergillus_flavus_uniq_df_trim,
    pyrenophora_tritici_uniq_df_trim,
    wheat_zymoseptoria_tritici_uniq_df_trim,
    wheat_pyrenophora_tritici_uniq_df_trim,
    diaporthe_unidentified_uniq_df_trim,
    scedosporium_boydii_uniq_df_trim,
    saccharomyces_cerevisiae_uniq_df_trim,
    zymoseptoria_tritici_uniq_df_trim,
    rhodotorula_mucilaginosa_uniq_df_trim,
    wheat_puccinia_striiformis_uniq_df_trim,
]

In [36]:
# Share of phylum-level matches per sample.
# NOTE(review): 1000 is presumably the number of reads per sample (the input
# file names carry '.1000.') -- confirm.
phylum_frequency = []
for matched_reads in df_uniq_trim_list:
    phylum_frequency.append(len(matched_reads) / 1000)

In [37]:
# Store the phylum-level fractions; row order must match `samples`.
final_df['phylum_accuracy'] = phylum_frequency

In [None]:
# Save the accuracy table as tab-separated text (index and header included).
# Note that the file is written into the same directory as the Kraken inputs.
accuracy_table_path = os.path.join(sourcedir, 'rank_accuracy_ncbiITS.txt')
final_df.to_csv(accuracy_table_path, sep='\t', header=True)

In [38]:
# Display the per-sample accuracy at every taxonomic rank.
final_df

Unnamed: 0,samples,species_accuracy,genus_accuracy,family_accuracy,order_accuracy,class_accuracy,phylum_accuracy
0,puccinia_striiformis,0.0,0.623,0.641,0.666,0.666,0.713
1,candida_orthopsilosis,0.481,0.66,0.679,0.782,0.782,0.868
2,candida_albicans,0.282,0.484,0.513,0.77,0.77,0.83
3,aspergillus_niger,0.0,0.73,0.79,0.799,0.847,0.934
4,candida_metapsilosis,0.007,0.646,0.666,0.794,0.794,0.869
5,diaporthe_ccl067,0.0,0.002,0.002,0.803,0.86,0.935
6,aspergillus_flavus,0.004,0.77,0.813,0.823,0.864,0.934
7,pyrenophora_tritici-repentis,0.0,0.456,0.625,0.659,0.696,0.853
8,wheat_zymoseptoria_tritici,0.173,0.21,0.34,0.353,0.428,0.532
9,wheat_pyrenophora_tritici-repentis,0.0,0.203,0.272,0.299,0.31,0.379
