### Notebook to analyze the efficiency of minimap2 mapping against a mock community

Starting points from Tavish  
reference_dataframe at '/media/MassStorage/tmp/TE/honours/analysis/Stats/reference_dataframe.csv'  
custom_database at '/media/MassStorage/tmp/TE/honours/database/custom_database_labelled.fasta'  
taxonomy_file at '/media/MassStorage/tmp/TE/honours/analysis/Stats/taxonomy_file.csv'

#### workflow

* Two databases
* subsample 15000 reads per each mock community species. Save those out.
* map reads against both databases with minimap2 and save the output in PAF format.
* get best hit per species (see what this means while looking at the data).
* add the full taxonomy to each best match using the taxonomy file.
* summarize data at different taxonomic ranks for each species.
* pull this all together somehow (summary across all the samples? focus on species of interest, e.g. those deleted from the analysis?)

#### requirements

* Bbmap (conda install bbmap https://anaconda.org/bioconda/bbmap)  
* minimap2 (conda install minimap2 https://anaconda.org/bioconda/minimap2)

#### fixes on the command line

* fixed the taxonomy file to fit the QIIME format

cat /media/MassStorage/tmp/TE/honours/analysis/Stats/taxonomy_file.csv | sed 's/,/\t/' > /media/MassStorage/tmp/TE/honours/analysis/Stats/taxonomy_file_v2.csv

#### fix the taxonomy_file_v2.csv more to reflect Qiime style

* This requires writing the species label as `s__Genus_species` (e.g. `s__Zymoseptoria_tritici`) instead of only `s__species` (e.g. `s__tritici`).

In [107]:
import re
# Rewrite the taxonomy file in Qiime style: prefix every species label with
# its genus, e.g. "...;g__Zymoseptoria;s__tritici" becomes
# "...;g__Zymoseptoria;s__Zymoseptoria_tritici".
old_taxonomy_file_fn = '/media/MassStorage/tmp/TE/honours/analysis/Stats/taxonomy_file_v2.csv'
new_taxonomy_file_fn = '/media/MassStorage/tmp/TE/honours/analysis/Stats/taxonomy_file_v3.csv'
# Compiled once up front rather than once per input line.
pattern = re.compile(r'g__\w+;')
# The input is opened first so that a missing input file does not leave a
# freshly truncated, empty output file behind.
with open(old_taxonomy_file_fn, 'r') as in_fh, open(new_taxonomy_file_fn, 'w') as out_fh:
    for line_number, line in enumerate(in_fh, start=1):
        line = line.rstrip()
        # Everything before the first 's__' holds the higher ranks (including
        # the genus); everything after it is kept as the species label.
        first_half, separator, second_half = line.partition('s__')
        genus_matches = pattern.findall(first_half)
        if not separator or not genus_matches:
            # Fail with the offending line instead of a bare IndexError.
            raise ValueError(
                F"{old_taxonomy_file_fn}:{line_number}: expected 'g__<genus>;' "
                F"followed by 's__<species>', got {line!r}"
            )
        genus = genus_matches[0].replace('g__','').replace(';','')
        new_line = F"{first_half}s__{genus}_{second_half}"
        print(new_line,file=out_fh)

In [108]:
!head -2 {old_taxonomy_file_fn}

20171103_FAH15473/barcode01	k__Fungi;p__Basidiomycota;c__Pucciniomycetes;o__Pucciniales;f__Pucciniaceae;g__Puccinia;s__striiformis-tritici
20171103_FAH15473/barcode02	k__Fungi;p__Ascomycota;c__Dothideomycetes;o__Capnodiales;f__Mycosphaerellaceae;g__Zymoseptoria;s__tritici


In [109]:
!head -2 {new_taxonomy_file_fn}

20171103_FAH15473/barcode01	k__Fungi;p__Basidiomycota;c__Pucciniomycetes;o__Pucciniales;f__Pucciniaceae;g__Puccinia;s__Puccinia_striiformis-tritici
20171103_FAH15473/barcode02	k__Fungi;p__Ascomycota;c__Dothideomycetes;o__Capnodiales;f__Mycosphaerellaceae;g__Zymoseptoria;s__Zymoseptoria_tritici


In [110]:
from Bio import SeqIO
import os
import random
import subprocess
import pandas as pd

#### Initial data

In [111]:
# Per-species reference table (taxonomy columns and read-file paths).
reference_dataframe_fn = os.path.abspath('/media/MassStorage/tmp/TE/honours/analysis/Stats/reference_dataframe.csv')
# FASTA file of the full ("max") custom database.
max_custom_database_fn = os.path.abspath('/media/MassStorage/tmp/TE/honours/database/custom_database_labelled.fasta')
# NOTE(review): the taxonomy-fix cell writes taxonomy_file_v3.csv, not
# taxonomy_file_qiime.csv -- confirm this is the intended file.
taxonomy_file_fn = os.path.abspath('/media/MassStorage/tmp/TE/honours/analysis/Stats/taxonomy_file_qiime.csv')

In [112]:
#threads to use
threads = 6

In [113]:
INPUT_BASEDIR = os.path.abspath('/media/MassStorage/tmp/TE/honours')

In [114]:
# Output directory of this analysis. The relative path is resolved against
# the working directory at the time this cell runs.
OUT_DIR = os.path.abspath('../../analysis/Mapping_mock_gsref')
# makedirs also creates missing parent directories and does nothing when the
# directory already exists, so the cell can be re-run safely.
os.makedirs(OUT_DIR, exist_ok=True)

In [115]:
# Species contained in the full ("max") database, written as Genus_species.
max_species = ['Puccinia_striiformis-tritici',
             'Zymoseptoria_tritici',
             'Pyrenophora_tritici-repentis',
             'Fusarium_oxysporum',
             'Tuber_brumale',
             'Cortinarius_globuliformis',
             'Aspergillus_niger',
             'Clavispora_lusitaniae',
             'Kluyveromyces_unidentified',
             'Penicillium_chrysogenum',
             'Rhodotorula_mucilaginosa',
             'Scedosporium_boydii',
             'Blastobotrys_proliferans',
             'Debaryomyces_unidentified',
             'Galactomyces_geotrichum',
             'Kodamaea_ohmeri',
             'Meyerozyma_guilliermondii',
             'Wickerhamomyces_anomalus',
             'Yamadazyma_mexicana',
             'Yamadazyma_scolyti',
             'Yarrowia_lipolytica',
             'Zygoascus_hellenicus',
             'Aspergillus_flavus',
             'Cryptococcus_zero',
             'Aspergillus_unidentified',
             'Diaporthe_CCL067',
             'Diaporthe_unidentified',
             'Oculimacula_yallundae-CCL031',
             'Oculimacula_yallundae-CCL029',
             'Dothiorella_vidmadera',
             'Quambalaria_cyanescens',
             'Entoleuca_unidentified',
             'Asteroma_CCL060',
             'Asteroma_CCL068',
             'Saccharomyces_cerevisiae',
             'Cladophialophora_unidentified',
             'Candida_albicans',
             'Candida_metapsilosis',
             'Candida_orthopsilosis',
             'Candida_parapsilosis',
             'Candida_unidentified',
             'Kluyveromyces_marxianus',
             'Pichia_kudriavzevii',
             'Pichia_membranifaciens']

In [116]:
# Species removed from the second (reduced) test database.
# Every entry is currently commented out, so nothing is removed.
species_delete = [
# 'Candida_orthopsilosis',
#                  'Candida_metapsilosis',
#                  'Aspergillus_niger'
]

In [117]:
# Species whose reads are searched against both databases.
# The mock community currently comprises every species of the full database,
# so it is derived from max_species instead of repeating the same 44 names
# (two hand-maintained copies can silently drift apart).
# To restrict the run to a subset, replace the assignment with an explicit
# list, e.g.:
# mock_community = ['Penicillium_chrysogenum',
#  'Aspergillus_flavus',
#  'Aspergillus_niger',
#  'Pichia_kudriavzevii',
#  'Pichia_membranifaciens',
#  'Candida_albicans',
#  'Candida_parapsilosis',
#  'Candida_orthopsilosis',
#  'Candida_metapsilosis']

mock_community = list(max_species)

In [118]:
# fixed_old_names = ['Kluyveromyces_lactis',
#                    'Candida_zeylanoides',
#                    'Cladophialophora_sp.',
#                    'Diaporthe_sp.',
#                    'CCL060',
#                    'CCL068',
#                    'CCL067',
#                    'Aspergillus_sp.',
#                    'Entoleuca_sp.',
#                    'Tapesia_yallundae_CCL029',
#                    'Tapesia_yallundae_CCL031',
#                    'Cryptococcus_neoformans']

In [119]:
# fixed_new_names = ['candida_unidentified',
#                    'debaryomyces_unidentified',
#                    'cladophialophora_unidentified',
#                    'diaporthe_unidentified',
#                    'asteroma_ccl060',
#                    'asteroma_ccl068',
#                    'diaporthe_ccl067',
#                    'aspergillus_unidentified',
#                    'entoleuca_unidentified',
#                    'oculimacula_yallundae-ccl029',
#                    'oculimacula_yallundae-ccl031',
#                    'kluyveromyces_unidentified']

In [120]:
# old_to_new_names = dict(zip(fixed_old_names, fixed_new_names))

In [121]:
# old_to_new_names

### Fix databases and names

In [122]:
# Load the reference table and add a "<genus>_<species>" key column.
ref_df = pd.read_csv(reference_dataframe_fn).assign(
    name_species=lambda df: df['genus'] + "_" + df['species']
)

In [123]:
ref_df.name_species.tolist()

['puccinia_striiformis-tritici',
 'zymoseptoria_tritici',
 'pyrenophora_tritici-repentis',
 'fusarium_oxysporum',
 'tuber_brumale',
 'cortinarius_globuliformis',
 'aspergillus_niger',
 'clavispora_lusitaniae',
 'kluyveromyces_unidentified',
 'penicillium_chrysogenum',
 'rhodotorula_mucilaginosa',
 'scedosporium_boydii',
 'blastobotrys_proliferans',
 'debaryomyces_unidentified',
 'galactomyces_geotrichum',
 'kodamaea_ohmeri',
 'meyerozyma_guilliermondii',
 'wickerhamomyces_anomalus',
 'yamadazyma_mexicana',
 'yamadazyma_scolyti',
 'yarrowia_lipolytica',
 'zygoascus_hellenicus',
 'aspergillus_flavus',
 'cryptococcus_zero',
 'aspergillus_unidentified',
 'diaporthe_ccl067',
 'diaporthe_unidentified',
 'oculimacula_yallundae-ccl031',
 'oculimacula_yallundae-ccl029',
 'dothiorella_vidmadera',
 'quambalaria_cyanescens',
 'entoleuca_unidentified',
 'asteroma_ccl060',
 'asteroma_ccl068',
 'saccharomyces_cerevisiae',
 'cladophialophora_unidentified',
 'candida_albicans',
 'candida_metapsilosis',

In [124]:
new_db_fn = os.path.join(OUT_DIR, 'gsref.db.fasta')

In [125]:
# Collect every sequence of the full custom database whose lower-cased id is
# a known "<genus>_<species>" name in ref_df. Ids of all other sequences are
# printed so they can be checked by hand.
new_db_list = []
old_db_list = []
# Built once up front so the membership test is O(1) per sequence.
known_names = set(ref_df.name_species.tolist())
for seq in SeqIO.parse(max_custom_database_fn, 'fasta'):
    old_db_list.append(seq.id)
    lowered_id = seq.id.lower()
    if lowered_id in known_names:
        # Normalise id, name and description to the lower-case species name.
        seq.id = seq.name = seq.description = lowered_id
        new_db_list.append(seq)
    else:
        print(seq.id)

In [126]:
# Only write the full database when no sequence was dropped while filtering.
if len(old_db_list) != len(new_db_list):
    print("please check!")
else:
    SeqIO.write(new_db_list, new_db_fn, 'fasta')

In [127]:
# Reduced database: the full database minus the species in species_delete.
sub_db_fn = os.path.join(OUT_DIR, 'gsref.subdb.fasta')
sub_db_list = []
# Lower-cased once up front; sequence ids in new_db_list are lower case.
deleted_names = {x.lower() for x in species_delete}
for seq in new_db_list:
    if seq.id not in deleted_names:
        sub_db_list.append(seq)

In [128]:
# Only write the reduced database when exactly one sequence was removed per
# entry of species_delete.
if len(new_db_list) != len(sub_db_list) + len(species_delete):
    print("please check!")
else:
    SeqIO.write(sub_db_list, sub_db_fn, 'fasta' )

In [129]:
[x.id for x in sub_db_list]

['puccinia_striiformis-tritici',
 'zymoseptoria_tritici',
 'pyrenophora_tritici-repentis',
 'fusarium_oxysporum',
 'tuber_brumale',
 'cortinarius_globuliformis',
 'aspergillus_niger',
 'clavispora_lusitaniae',
 'kluyveromyces_unidentified',
 'penicillium_chrysogenum',
 'rhodotorula_mucilaginosa',
 'scedosporium_boydii',
 'blastobotrys_proliferans',
 'debaryomyces_unidentified',
 'galactomyces_geotrichum',
 'kodamaea_ohmeri',
 'meyerozyma_guilliermondii',
 'wickerhamomyces_anomalus',
 'yamadazyma_mexicana',
 'yamadazyma_scolyti',
 'yarrowia_lipolytica',
 'zygoascus_hellenicus',
 'aspergillus_flavus',
 'cryptococcus_zero',
 'aspergillus_unidentified',
 'diaporthe_ccl067',
 'diaporthe_unidentified',
 'oculimacula_yallundae-ccl031',
 'oculimacula_yallundae-ccl029',
 'dothiorella_vidmadera',
 'quambalaria_cyanescens',
 'entoleuca_unidentified',
 'asteroma_ccl060',
 'asteroma_ccl068',
 'saccharomyces_cerevisiae',
 'cladophialophora_unidentified',
 'candida_albicans',
 'candida_metapsilosis',

In [130]:
mock_community = [x.lower() for x in mock_community]

In [131]:
mock_community

['puccinia_striiformis-tritici',
 'zymoseptoria_tritici',
 'pyrenophora_tritici-repentis',
 'fusarium_oxysporum',
 'tuber_brumale',
 'cortinarius_globuliformis',
 'aspergillus_niger',
 'clavispora_lusitaniae',
 'kluyveromyces_unidentified',
 'penicillium_chrysogenum',
 'rhodotorula_mucilaginosa',
 'scedosporium_boydii',
 'blastobotrys_proliferans',
 'debaryomyces_unidentified',
 'galactomyces_geotrichum',
 'kodamaea_ohmeri',
 'meyerozyma_guilliermondii',
 'wickerhamomyces_anomalus',
 'yamadazyma_mexicana',
 'yamadazyma_scolyti',
 'yarrowia_lipolytica',
 'zygoascus_hellenicus',
 'aspergillus_flavus',
 'cryptococcus_zero',
 'aspergillus_unidentified',
 'diaporthe_ccl067',
 'diaporthe_unidentified',
 'oculimacula_yallundae-ccl031',
 'oculimacula_yallundae-ccl029',
 'dothiorella_vidmadera',
 'quambalaria_cyanescens',
 'entoleuca_unidentified',
 'asteroma_ccl060',
 'asteroma_ccl068',
 'saccharomyces_cerevisiae',
 'cladophialophora_unidentified',
 'candida_albicans',
 'candida_metapsilosis',

### Subsample reads

In [132]:
def subsamplereads(in_fn, out_fn, n_reads):
    """Subsample reads from a sequence file with reformat.sh (BBMap).

    Runs ``reformat.sh samplereadstarget=<n_reads> in=<in_fn> out=<out_fn>``
    through the shell and reports the outcome on stdout.

    Args:
        in_fn: Path of the read file to sample from.
        out_fn: Path the sampled reads are written to.
        n_reads: Value passed to reformat.sh as ``samplereadstarget``.

    Returns:
        True if the command exited with status 0, False otherwise.
    """
    # Function-scope import keeps this cell independent of the import cell.
    import shlex

    # Quote every value so spaces or shell metacharacters in a path cannot
    # split or alter the command line.
    command = (F'reformat.sh samplereadstarget={shlex.quote(str(n_reads))} '
               F'in={shlex.quote(str(in_fn))} out={shlex.quote(str(out_fn))}')
    status, output = subprocess.getstatusoutput(command)
    if status == 0:
        print(F":)Completed {command}\n")
        return True
    print(F":(check one {command}!!\n")
    # Show what the tool reported; otherwise the reason for the failure is lost.
    print(F"{output}\n")
    return False

In [133]:
n_reads = 15000

In [134]:
# Directory that receives the subsampled mock-community reads.
MC_READ_DIR = os.path.join(OUT_DIR, 'MC_READS')
# exist_ok makes re-running the cell a no-op instead of a check-then-create.
os.makedirs(MC_READ_DIR, exist_ok=True)

In [135]:
ref_df.columns

Index(['Unnamed: 0', 'species', 'genus', 'family', 'order', 'class', 'phylum',
       'kingdom', '# raw reads', '# reads after homology filtering',
       '# reads after length filtering', '# for use', 'path to raw reads',
       'path to homology filtering', 'path to length filtering',
       'path for use', 'name_species'],
      dtype='object')

In [136]:
# Map each mock-community species to the path of its read file: the
# 'path for use' column of ref_df joined onto INPUT_BASEDIR.
# Species are looked up by the full 'name_species' key; splitting the name on
# '_' would truncate any species label that itself contains an underscore.
# drop_duplicates keeps the first row per species.
path_for_use = ref_df.drop_duplicates('name_species').set_index('name_species')['path for use']
missing = [x for x in mock_community if x not in path_for_use.index]
if missing:
    # Name the offending species instead of failing with a bare IndexError.
    raise KeyError(F"species without an entry in {reference_dataframe_fn}: {missing}")
fn_subsampling = {x: os.path.join(INPUT_BASEDIR, path_for_use[x]) for x in mock_community}
fn_subsampling

{'puccinia_striiformis-tritici': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode01/length_restricted_for_use.fasta',
 'zymoseptoria_tritici': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode02/length_restricted_for_use.fasta',
 'pyrenophora_tritici-repentis': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode03/length_restricted_for_use.fasta',
 'fusarium_oxysporum': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode04/length_restricted_for_use.fasta',
 'tuber_brumale': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode05/length_restricted_for_use.fasta',
 'cortinarius_globuliformis': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode06/length_restricted_for_use.fasta',
 'aspergillus_niger': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcod

In [137]:
# Subsample n_reads reads per species. The output path is recorded for every
# species, whether or not subsamplereads reports success.
sub_reads_fn = {}
for species, in_fn in fn_subsampling.items():
    out_fn = os.path.join(MC_READ_DIR, F'{species}.{n_reads}.fasta')
    subsamplereads(in_fn, out_fn, n_reads)
    sub_reads_fn[species] = out_fn

:(check one reformat.sh samplereadstarget=15000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode01/length_restricted_for_use.fasta out=/media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/puccinia_striiformis-tritici.15000.fasta!!

:(check one reformat.sh samplereadstarget=15000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode02/length_restricted_for_use.fasta out=/media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/zymoseptoria_tritici.15000.fasta!!

:(check one reformat.sh samplereadstarget=15000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode03/length_restricted_for_use.fasta out=/media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/pyrenophora_tritici-repentis.15000.fasta!!

:(check one reformat.sh samplereadstarget=15000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode04/len

:(check one reformat.sh samplereadstarget=15000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171212_FAH18688/barcode08/length_restricted_for_use.fasta out=/media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/quambalaria_cyanescens.15000.fasta!!

:(check one reformat.sh samplereadstarget=15000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171212_FAH18688/barcode09/length_restricted_for_use.fasta out=/media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/entoleuca_unidentified.15000.fasta!!

:(check one reformat.sh samplereadstarget=15000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171212_FAH18688/barcode11/length_restricted_for_use.fasta out=/media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/asteroma_ccl060.15000.fasta!!

:(check one reformat.sh samplereadstarget=15000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171212_FAH18688/barcode12/length_restricted_fo

### Map with minimap against both databases

In [138]:
def minimapmapping(fasta_fn, ref_fn, out_fn, threads):
    """Map reads against a reference with minimap2 (``-x map-ont`` preset).

    Runs ``minimap2 -x map-ont -t <threads> <ref_fn> <fasta_fn> -o <out_fn>``
    through the shell and reports the outcome on stdout.

    Args:
        fasta_fn: Path of the read file to map.
        ref_fn: Path of the reference (database) FASTA file.
        out_fn: Path the alignments are written to.
        threads: Value passed to minimap2 as ``-t``.

    Returns:
        True if the command exited with status 0, False otherwise.
    """
    # Function-scope import keeps this cell independent of the import cell.
    import shlex

    # Quote every value so spaces or shell metacharacters in a path cannot
    # split or alter the command line.
    command = (F"minimap2 -x map-ont -t {shlex.quote(str(threads))} "
               F"{shlex.quote(str(ref_fn))} {shlex.quote(str(fasta_fn))} "
               F"-o {shlex.quote(str(out_fn))}")
    status, output = subprocess.getstatusoutput(command)
    if status == 0:
        print(F":)Completed {command}\n")
        return True
    print(F":(check one {command}!!\n")
    # Show what the tool reported; otherwise the reason for the failure is lost.
    print(F"{output}\n")
    return False

In [139]:
# One output directory per database, named after its FASTA file
# (e.g. 'gsref.subdb.fasta' -> '<OUT_DIR>/gsref_subdb').
dbases_fn = {}
for x in [sub_db_fn, new_db_fn]:
    dbases_fn[x] = os.path.join(OUT_DIR, os.path.basename(x).replace('.fasta', '').replace('.','_'))
    # exist_ok makes re-running the cell a no-op instead of a check-then-create.
    os.makedirs(dbases_fn[x], exist_ok=True)
dbases_fn

{'/media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.subdb.fasta': '/media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_subdb',
 '/media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.db.fasta': '/media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_db'}

In [140]:
# Map every subsampled read set against the reduced database and remember
# the PAF file written for each species.
db_fn = sub_db_fn
sub_db_mapping_fn = {}
# Output directory and file prefix depend only on the database.
tmp_out = dbases_fn[db_fn]
db_name = os.path.basename(db_fn).replace('.fasta', '')
for species, fasta_fn in sub_reads_fn.items():
    out_fn = os.path.join(tmp_out, F"{db_name}.{species}.minimap2.paf")
    sub_db_mapping_fn[species] = out_fn
    minimapmapping(fasta_fn, db_fn, out_fn, threads)

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/puccinia_striiformis-tritici.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_subdb/gsref.subdb.puccinia_striiformis-tritici.minimap2.paf

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/zymoseptoria_tritici.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_subdb/gsref.subdb.zymoseptoria_tritici.minimap2.paf

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/pyrenophora_tritici-repentis.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/aspergillus_unidentified.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_subdb/gsref.subdb.aspergillus_unidentified.minimap2.paf

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/diaporthe_ccl067.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_subdb/gsref.subdb.diaporthe_ccl067.minimap2.paf

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/diaporthe_unidentified.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_subdb/gsref.subd

In [141]:
# Map every subsampled read set against the full database and remember the
# PAF file written for each species.
db_fn = new_db_fn
new_db_mapping_fn = {}
# Output directory and file prefix depend only on the database.
tmp_out = dbases_fn[db_fn]
db_name = os.path.basename(db_fn).replace('.fasta', '')
for species, fasta_fn in sub_reads_fn.items():
    out_fn = os.path.join(tmp_out, F"{db_name}.{species}.minimap2.paf")
    new_db_mapping_fn[species] = out_fn
    minimapmapping(fasta_fn, db_fn, out_fn, threads)

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/puccinia_striiformis-tritici.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_db/gsref.db.puccinia_striiformis-tritici.minimap2.paf

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/zymoseptoria_tritici.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_db/gsref.db.zymoseptoria_tritici.minimap2.paf

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/pyrenophora_tritici-repentis.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_db/gsref.db.pyr

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/diaporthe_ccl067.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_db/gsref.db.diaporthe_ccl067.minimap2.paf

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/diaporthe_unidentified.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_db/gsref.db.diaporthe_unidentified.minimap2.paf

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/oculimacula_yallundae-ccl031.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_db/gsref.db.oculimacula_yallundae-c

### Look at mapping results

In [142]:
def mapping_results(fn, species):
    """Print and return the per-target share of best hits in a PAF file.

    For every query read only the alignment(s) with the highest mapping
    quality are kept; among those, only the alignment(s) with the highest
    number of matching bases. The remaining alignments are counted per
    target name and expressed as fractions of all retained alignments.

    Args:
        fn: Path to the PAF file; only its first 12 columns are read.
        species: Name of the query species, used in the printed report only.

    Returns:
        pandas.Series of fractions indexed by target name ('tname'), sorted
        in descending order. Empty when the file holds no alignments.
    """
    min_header = ['qseqid', 'qlen', 'qstart', 'qstop', 'strand', 'tname',
                  'tlen', 'tstart', 'tend', 'nmatch', 'alen', 'mquality']
    try:
        tmp_df = pd.read_csv(fn, sep='\t', header=None,
                             usecols=list(range(12)), names=min_header)
    except pd.errors.EmptyDataError:
        # A file without any data cannot be parsed; treat it as "no hits".
        tmp_df = None
    if tmp_df is None or tmp_df.empty:
        print('##########\n')
        print(F"This was the query species: {species}\n")
        print("No alignments found.\n")
        return pd.Series(dtype=float)

    # Best mapping quality per read. 'max' is passed by name because handing
    # the builtin max to transform is deprecated in recent pandas.
    best_quality = tmp_df.groupby('qseqid')['mquality'].transform('max')
    sub_df = tmp_df[tmp_df['mquality'] == best_quality].reset_index(drop=True)
    # Break remaining ties by the highest number of matching bases.
    best_nmatch = sub_df.groupby('qseqid')['nmatch'].transform('max')
    sub_df = sub_df[sub_df['nmatch'] == best_nmatch].reset_index(drop=True)

    # Exact ties survive both filters, so a read may contribute several rows.
    hits_per_target = sub_df.groupby('tname')['mquality'].count()
    hit_series = hits_per_target / hits_per_target.sum()
    hit_series.name = None  # keep the printed report free of a "Name:" footer
    hit_series = hit_series.sort_values(ascending=False)

    # Sanity check: True when every read still has at least one retained hit.
    print(sub_df.qseqid.unique().shape == tmp_df.qseqid.unique().shape)
    print('##########\n')
    print(F"This was the query species: {species}\n")
    print(F"These are the results:")
    print(hit_series,'\n')
    return hit_series

In [143]:
# Report the best-hit distribution of every species' reads mapped against the full database.
for species, hit_fn in new_db_mapping_fn.items():
    mapping_results(hit_fn, species)

True
##########

This was the query species: puccinia_striiformis-tritici

These are the results:
tname
puccinia_striiformis-tritici    0.998531
pyrenophora_tritici-repentis    0.000467
zymoseptoria_tritici            0.000134
penicillium_chrysogenum         0.000134
cortinarius_globuliformis       0.000134
clavispora_lusitaniae           0.000134
aspergillus_niger               0.000134
wickerhamomyces_anomalus        0.000067
tuber_brumale                   0.000067
scedosporium_boydii             0.000067
rhodotorula_mucilaginosa        0.000067
meyerozyma_guilliermondii       0.000067
dtype: float64 

True
##########

This was the query species: zymoseptoria_tritici

These are the results:
tname
zymoseptoria_tritici             0.996068
pyrenophora_tritici-repentis     0.001266
penicillium_chrysogenum          0.000533
tuber_brumale                    0.000267
fusarium_oxysporum               0.000200
puccinia_striiformis-tritici     0.000133
kluyveromyces_unidentified       0.0001

True
##########

This was the query species: kluyveromyces_unidentified

These are the results:
tname
kluyveromyces_unidentified       0.488380
kluyveromyces_marxianus          0.425123
cryptococcus_zero                0.059735
cortinarius_globuliformis        0.004610
rhodotorula_mucilaginosa         0.002625
entoleuca_unidentified           0.002049
quambalaria_cyanescens           0.001473
meyerozyma_guilliermondii        0.001409
candida_unidentified             0.001345
saccharomyces_cerevisiae         0.001280
wickerhamomyces_anomalus         0.000960
yarrowia_lipolytica              0.000960
oculimacula_yallundae-ccl029     0.000960
candida_metapsilosis             0.000768
candida_orthopsilosis            0.000704
clavispora_lusitaniae            0.000704
yamadazyma_mexicana              0.000704
zymoseptoria_tritici             0.000576
candida_albicans                 0.000512
debaryomyces_unidentified        0.000512
candida_parapsilosis             0.000384
penicillium_chry

True
##########

This was the query species: meyerozyma_guilliermondii

These are the results:
tname
meyerozyma_guilliermondii       0.994081
debaryomyces_unidentified       0.000931
yamadazyma_mexicana             0.000665
yamadazyma_scolyti              0.000532
candida_orthopsilosis           0.000399
wickerhamomyces_anomalus        0.000333
candida_metapsilosis            0.000333
zygoascus_hellenicus            0.000266
candida_albicans                0.000266
candida_unidentified            0.000266
kodamaea_ohmeri                 0.000266
kluyveromyces_unidentified      0.000200
candida_parapsilosis            0.000200
scedosporium_boydii             0.000200
pichia_kudriavzevii             0.000133
yarrowia_lipolytica             0.000133
kluyveromyces_marxianus         0.000133
galactomyces_geotrichum         0.000133
aspergillus_flavus              0.000133
puccinia_striiformis-tritici    0.000067
pyrenophora_tritici-repentis    0.000067
rhodotorula_mucilaginosa        0.0000

True
##########

This was the query species: diaporthe_unidentified

These are the results:
tname
diaporthe_unidentified          0.702311
asteroma_ccl060                 0.288997
entoleuca_unidentified          0.001646
asteroma_ccl068                 0.001119
diaporthe_ccl067                0.001119
kodamaea_ohmeri                 0.000922
fusarium_oxysporum              0.000593
oculimacula_yallundae-ccl029    0.000461
dothiorella_vidmadera           0.000329
tuber_brumale                   0.000329
cryptococcus_zero               0.000329
scedosporium_boydii             0.000263
quambalaria_cyanescens          0.000263
oculimacula_yallundae-ccl031    0.000263
aspergillus_unidentified        0.000198
kluyveromyces_unidentified      0.000132
yamadazyma_scolyti              0.000132
saccharomyces_cerevisiae        0.000132
puccinia_striiformis-tritici    0.000066
pyrenophora_tritici-repentis    0.000066
cortinarius_globuliformis       0.000066
clavispora_lusitaniae           0.000066


True
##########

This was the query species: saccharomyces_cerevisiae

These are the results:
tname
saccharomyces_cerevisiae         0.992092
kluyveromyces_unidentified       0.002259
kluyveromyces_marxianus          0.002126
debaryomyces_unidentified        0.000532
wickerhamomyces_anomalus         0.000465
candida_metapsilosis             0.000332
yamadazyma_mexicana              0.000266
candida_orthopsilosis            0.000266
pichia_membranifaciens           0.000199
meyerozyma_guilliermondii        0.000199
candida_parapsilosis             0.000133
cladophialophora_unidentified    0.000133
clavispora_lusitaniae            0.000133
zygoascus_hellenicus             0.000133
blastobotrys_proliferans         0.000133
dothiorella_vidmadera            0.000066
cryptococcus_zero                0.000066
yamadazyma_scolyti               0.000066
kodamaea_ohmeri                  0.000066
candida_unidentified             0.000066
pichia_kudriavzevii              0.000066
rhodotorula_mucila

True
##########

This was the query species: candida_unidentified

These are the results:
tname
candida_unidentified             0.882645
candida_albicans                 0.100478
candida_orthopsilosis            0.003140
candida_metapsilosis             0.002355
candida_parapsilosis             0.001635
oculimacula_yallundae-ccl029     0.001047
entoleuca_unidentified           0.001047
meyerozyma_guilliermondii        0.000981
kodamaea_ohmeri                  0.000654
yamadazyma_scolyti               0.000654
debaryomyces_unidentified        0.000589
yamadazyma_mexicana              0.000589
wickerhamomyces_anomalus         0.000589
saccharomyces_cerevisiae         0.000589
kluyveromyces_unidentified       0.000392
clavispora_lusitaniae            0.000327
tuber_brumale                    0.000262
yarrowia_lipolytica              0.000262
blastobotrys_proliferans         0.000196
kluyveromyces_marxianus          0.000131
zygoascus_hellenicus             0.000131
pichia_membranifaciens

In [144]:
# Same report for the reduced database, i.e. the full database minus the species listed in
# species_delete (e.g. ['Candida_orthopsilosis', 'Candida_metapsilosis', 'Aspergillus_niger']).
# NOTE(review): species_delete is currently empty, so both databases hold the same sequences.
for species, hit_fn in sub_db_mapping_fn.items():
    mapping_results(hit_fn, species)

True
##########

This was the query species: puccinia_striiformis-tritici

These are the results:
tname
puccinia_striiformis-tritici    0.998531
pyrenophora_tritici-repentis    0.000467
zymoseptoria_tritici            0.000134
penicillium_chrysogenum         0.000134
cortinarius_globuliformis       0.000134
clavispora_lusitaniae           0.000134
aspergillus_niger               0.000134
wickerhamomyces_anomalus        0.000067
tuber_brumale                   0.000067
scedosporium_boydii             0.000067
rhodotorula_mucilaginosa        0.000067
meyerozyma_guilliermondii       0.000067
dtype: float64 

True
##########

This was the query species: zymoseptoria_tritici

These are the results:
tname
zymoseptoria_tritici             0.996068
pyrenophora_tritici-repentis     0.001266
penicillium_chrysogenum          0.000533
tuber_brumale                    0.000267
fusarium_oxysporum               0.000200
puccinia_striiformis-tritici     0.000133
kluyveromyces_unidentified       0.0001

True
##########

This was the query species: kluyveromyces_unidentified

These are the results:
tname
kluyveromyces_unidentified       0.488380
kluyveromyces_marxianus          0.425123
cryptococcus_zero                0.059735
cortinarius_globuliformis        0.004610
rhodotorula_mucilaginosa         0.002625
entoleuca_unidentified           0.002049
quambalaria_cyanescens           0.001473
meyerozyma_guilliermondii        0.001409
candida_unidentified             0.001345
saccharomyces_cerevisiae         0.001280
wickerhamomyces_anomalus         0.000960
yarrowia_lipolytica              0.000960
oculimacula_yallundae-ccl029     0.000960
candida_metapsilosis             0.000768
candida_orthopsilosis            0.000704
clavispora_lusitaniae            0.000704
yamadazyma_mexicana              0.000704
zymoseptoria_tritici             0.000576
candida_albicans                 0.000512
debaryomyces_unidentified        0.000512
candida_parapsilosis             0.000384
penicillium_chry

True
##########

This was the query species: meyerozyma_guilliermondii

These are the results:
tname
meyerozyma_guilliermondii       0.994081
debaryomyces_unidentified       0.000931
yamadazyma_mexicana             0.000665
yamadazyma_scolyti              0.000532
candida_orthopsilosis           0.000399
wickerhamomyces_anomalus        0.000333
candida_metapsilosis            0.000333
zygoascus_hellenicus            0.000266
candida_albicans                0.000266
candida_unidentified            0.000266
kodamaea_ohmeri                 0.000266
kluyveromyces_unidentified      0.000200
candida_parapsilosis            0.000200
scedosporium_boydii             0.000200
pichia_kudriavzevii             0.000133
yarrowia_lipolytica             0.000133
kluyveromyces_marxianus         0.000133
galactomyces_geotrichum         0.000133
aspergillus_flavus              0.000133
puccinia_striiformis-tritici    0.000067
pyrenophora_tritici-repentis    0.000067
rhodotorula_mucilaginosa        0.0000

True
##########

This was the query species: oculimacula_yallundae-ccl031

These are the results:
tname
oculimacula_yallundae-ccl031     0.963964
oculimacula_yallundae-ccl029     0.027756
quambalaria_cyanescens           0.000795
entoleuca_unidentified           0.000530
dothiorella_vidmadera            0.000530
asteroma_ccl060                  0.000530
yamadazyma_mexicana              0.000464
tuber_brumale                    0.000464
pyrenophora_tritici-repentis     0.000397
asteroma_ccl068                  0.000331
clavispora_lusitaniae            0.000331
zygoascus_hellenicus             0.000331
fusarium_oxysporum               0.000265
cladophialophora_unidentified    0.000265
cryptococcus_zero                0.000265
diaporthe_unidentified           0.000199
diaporthe_ccl067                 0.000199
aspergillus_niger                0.000199
scedosporium_boydii              0.000199
yarrowia_lipolytica              0.000199
galactomyces_geotrichum          0.000132
aspergillus_un

True
##########

This was the query species: cladophialophora_unidentified

These are the results:
tname
cladophialophora_unidentified    0.997734
zymoseptoria_tritici             0.000267
candida_orthopsilosis            0.000267
candida_albicans                 0.000200
kluyveromyces_unidentified       0.000200
fusarium_oxysporum               0.000133
candida_metapsilosis             0.000133
clavispora_lusitaniae            0.000133
candida_parapsilosis             0.000067
candida_unidentified             0.000067
entoleuca_unidentified           0.000067
galactomyces_geotrichum          0.000067
wickerhamomyces_anomalus         0.000067
oculimacula_yallundae-ccl029     0.000067
oculimacula_yallundae-ccl031     0.000067
penicillium_chrysogenum          0.000067
pichia_membranifaciens           0.000067
quambalaria_cyanescens           0.000067
rhodotorula_mucilaginosa         0.000067
saccharomyces_cerevisiae         0.000067
tuber_brumale                    0.000067
asteroma_ccl0

True
##########

This was the query species: kluyveromyces_marxianus

These are the results:
tname
kluyveromyces_marxianus          0.740035
kluyveromyces_unidentified       0.249117
candida_metapsilosis             0.000963
oculimacula_yallundae-ccl029     0.000963
entoleuca_unidentified           0.000706
wickerhamomyces_anomalus         0.000706
yamadazyma_scolyti               0.000578
clavispora_lusitaniae            0.000514
debaryomyces_unidentified        0.000514
saccharomyces_cerevisiae         0.000514
meyerozyma_guilliermondii        0.000514
yamadazyma_mexicana              0.000449
candida_orthopsilosis            0.000449
candida_albicans                 0.000321
yarrowia_lipolytica              0.000321
candida_unidentified             0.000321
candida_parapsilosis             0.000257
tuber_brumale                    0.000257
oculimacula_yallundae-ccl031     0.000257
pichia_kudriavzevii              0.000257
galactomyces_geotrichum          0.000193
rhodotorula_mucilag

### Pull in mapping results and analyse them at all available levels

##### idea

* pull in query taxid as a dictionary
* assign taxid for each tname species from minimap2
* generate a summary dictionary that checks concordance at each taxonomic rank

In [145]:
def pull_mapping_results(fn):
    """
    Takes a minimap2 paf and reads it in with the first 12 columns. Ignores the rest.
    Filters for each read (qseqid) the best hits on mquality first, taking the highest value.
    Filters for each read by the highest number of nmatches in the second step.
    Hits that tie in both steps are all kept, so one read can count towards several tnames.
    Returns a dataframe that has the tnames as index and the counts of hits as column 'count',
    sorted by descending count.
    The dataframe has also the taxrank columns ['k', 'p', 'c', 'o', 'f', 'g', 's'] that are all False to start with.
    These columns are object-typed so that taxon names can be written into them afterwards.
    """
    min_header = ['qseqid', 'qlen', 'qstart', 'qstop', 'strand', 'tname', 'tlen', 'tstart', 'tend', 'nmatch', 'alen', 'mquality']
    tax_ranks = ['k', 'p', 'c', 'o', 'f', 'g', 's']
    tmp_df = pd.read_csv(fn, sep='\t', header=None, usecols=list(range(12)), names=min_header)

    # Step 1: per read keep the alignments with the highest mapping quality.
    # The string 'max' selects the groupby maximum without passing the builtin callable.
    best_mquality = tmp_df.groupby('qseqid')['mquality'].transform('max')
    sub_df = tmp_df[tmp_df['mquality'] == best_mquality].reset_index(drop=True)

    # Step 2: among those, per read keep the alignments with the most matching bases.
    best_nmatch = sub_df.groupby('qseqid')['nmatch'].transform('max')
    sub_df = sub_df[sub_df['nmatch'] == best_nmatch].reset_index(drop=True)

    # Count the retained hits per target sequence (computed once, index is tname).
    hit_df = sub_df.groupby('tname')['mquality'].count().rename('count').to_frame()
    hit_df.sort_values(by='count', ascending=False, inplace=True)

    # Placeholders stay False until a taxonomy is assigned; object dtype lets them
    # receive strings later without an incompatible-dtype assignment.
    for key in tax_ranks:
        hit_df[key] = pd.Series(False, index=hit_df.index, dtype=object)
    return hit_df

In [146]:
def getquery_taxfileid(refdf_fn, species):
    """
    Looks up the taxonomy-file identifier of a species in the reference dataframe.
    Takes the reference dataframe filename (csv) and the species name written as 'genus_species'.
    Returns the column 0 value (the date/flowcellid) of the first row whose
    'genus' and 'species' columns, joined by '_', equal the species name.
    """
    reference = pd.read_csv(refdf_fn)
    # Build the combined 'genus_species' key the query is compared against.
    reference['name_species'] = reference['genus'] + "_" + reference['species']
    is_query = reference['name_species'] == species
    first_column = reference.loc[is_query].iloc[:, 0]
    return first_column.values[0]

In [147]:
def get_taxid_dict(taxid_fn, taxfileid):
    """
    Takes a taxonomy assignment file filename in the Qiime format and a taxonomic identifier.
    Each line of the file is expected to read '<identifier><tab>k__...;p__...;...;s__...'.
    Returns a dictionary with the taxonomic assignment at each rank, e.g. {'k': 'Fungi', ...}.
    Lines whose identifier equals taxfileid exactly take precedence. Only if there is no
    exact match are lines that merely start with taxfileid used, so identifiers that
    are a prefix of other identifiers do not pick up the wrong lineage.
    Returns an empty dictionary if no line matches.
    """
    exact_dict = {}
    prefix_dict = {}
    with open(taxid_fn, 'r') as fh:
        for line in fh:
            if not line.startswith(taxfileid):
                continue
            fields = line.rstrip().split('\t')
            # Keep exact identifier matches apart from mere prefix matches.
            tax_dict = exact_dict if fields[0] == taxfileid else prefix_dict
            for taxrank in fields[1].split(';'):
                rank_parts = taxrank.split('__')
                tax_dict[rank_parts[0]] = rank_parts[1]
    return exact_dict if exact_dict else prefix_dict

In [148]:
def assign_taxranks_results(mapping_df, tax_fn, ref_df_fn = False):
    """
    This function assigns the taxonomic ranks for each hit in the mapping results dataframe.
    It takes a mapping_df (indexed by tname), a taxonomy assignment file, and if required a
    reference dataframe filename. With a reference dataframe each tname is treated as a
    species name and translated into the taxonomy-file identifier first; without it the
    tname itself is used as the identifier.
    The mapping dataframe is modified in place and also returned with the assignment.
    Hits without a taxonomy entry keep their placeholder values.
    """
    for tname in mapping_df.index:
        tmp_taxfileid = getquery_taxfileid(ref_df_fn, tname) if ref_df_fn else tname
        tmp_tax_dict = get_taxid_dict(tax_fn, tmp_taxfileid)
        for key, value in tmp_tax_dict.items():
            # The rank columns start out as boolean False placeholders; make them
            # object-typed before writing a taxon name so the assignment is not an
            # incompatible-dtype write into a bool column.
            if key in mapping_df.columns and mapping_df[key].dtype == bool:
                mapping_df[key] = mapping_df[key].astype(object)
            mapping_df.loc[tname, key] = value
    return mapping_df

In [149]:
def get_accuracy_dict(mapping_df, query_tax_dict):
    """
    Summarises the mapping accuracy of the mapping results at all taxonomic ranks.
    Takes the mapping_df with taxonomic assignments (rank columns plus a 'count' column)
    and a taxonomic dictionary of the known query, keyed by rank.
    Returns an accuracy dictionary for each taxonomic rank ['k', 'p', 'c', 'o', 'f', 'g', 's']:
    the summed count of hits that agree with the query at that rank divided by the summed count of all hits.
    """
    rank_order = ('k', 'p', 'c', 'o', 'f', 'g', 's')
    counts = mapping_df['count']
    n_total = counts.sum()
    return {
        rank: counts[mapping_df[rank] == query_tax_dict[rank]].sum() / n_total
        for rank in rank_order
    }

In [150]:
### Test out the summary results statistic for a single mapping result
species = 'penicillium_chrysogenum'
# NOTE(review): this rebinds the name `mapping_results`, which an earlier cell calls as a
# function (mapping_results(hit_fn, species)); that cell cannot be re-run after this one.
mapping_results = pull_mapping_results(sub_db_mapping_fn[species])

In [151]:
### Assign the taxonomic ranks to all hits in the mapping results
# ref_df_fn is given, so every tname is first translated through the reference dataframe
mapping_results = assign_taxranks_results(mapping_results, taxonomy_file_fn, ref_df_fn=reference_dataframe_fn)

# identifier of the query species in the taxonomy file
taxfileid = getquery_taxfileid(reference_dataframe_fn, species)

# known taxonomy of the query species, one entry per rank
query_tax_dict = get_taxid_dict(taxonomy_file_fn, taxfileid)

# fraction of the counted hits that agree with the query at each rank
sensitivity_dict = get_accuracy_dict(mapping_results, query_tax_dict)

In [152]:
sensitivity_dict

{'k': 1.0,
 'p': 0.9992679355783309,
 'c': 0.9954079595368028,
 'o': 0.9952083056036204,
 'f': 0.9952083056036204,
 'g': 0.9915479834952748,
 's': 0.9915479834952748}

In [153]:
### Test out the summary results statistic for a single mapping result
species = 'candida_albicans'
# best-hit counts per target for this species' PAF file
mapping_results = pull_mapping_results(sub_db_mapping_fn[species])

In [154]:
### Assign the taxonomic ranks to all hits in the mapping results
# ref_df_fn is given, so every tname is first translated through the reference dataframe
mapping_results = assign_taxranks_results(mapping_results, taxonomy_file_fn, ref_df_fn=reference_dataframe_fn)

# identifier of the query species in the taxonomy file
taxfileid = getquery_taxfileid(reference_dataframe_fn, species)

# known taxonomy of the query species, one entry per rank
query_tax_dict = get_taxid_dict(taxonomy_file_fn, taxfileid)

# fraction of the counted hits that agree with the query at each rank
sensitivity_dict = get_accuracy_dict(mapping_results, query_tax_dict)

In [155]:
sensitivity_dict

{'k': 1.0,
 'p': 0.9995496364923117,
 'c': 0.995560702567072,
 'o': 0.995560702567072,
 'f': 0.9855240301100173,
 'g': 0.9855240301100173,
 's': 0.49501383259345044}

In [156]:
### Test out the summary results statistic for a single mapping result
species = 'aspergillus_niger'
# best-hit counts per target for this species' PAF file
mapping_results = pull_mapping_results(sub_db_mapping_fn[species])

In [157]:
### Assign the taxonomic ranks to all hits in the mapping results
# ref_df_fn is given, so every tname is first translated through the reference dataframe
mapping_results = assign_taxranks_results(mapping_results, taxonomy_file_fn, ref_df_fn=reference_dataframe_fn)

# identifier of the query species in the taxonomy file
taxfileid = getquery_taxfileid(reference_dataframe_fn, species)

# known taxonomy of the query species, one entry per rank
query_tax_dict = get_taxid_dict(taxonomy_file_fn, taxfileid)

# fraction of the counted hits that agree with the query at each rank
sensitivity_dict = get_accuracy_dict(mapping_results, query_tax_dict)

In [158]:
sensitivity_dict

{'k': 1.0,
 'p': 0.9993412384716732,
 'c': 0.9960474308300395,
 'o': 0.9957839262187088,
 'f': 0.9957839262187088,
 'g': 0.9946640316205534,
 's': 0.8432147562582345}

### Test run on the qiime2 Database

##### Prep on the command line

cp sh_refs_qiime_ver8_dynamic_02.02.2019.fasta /media/WorkingStorage/ben.working/students/tavish/analysis/qiime2/db/.  
cp sh_taxonomy_qiime_ver8_dynamic_02.02.2019.txt /media/WorkingStorage/ben.working/students/tavish/analysis/qiime2/db/.


In [159]:
def pull_mapping_results_v2(fn):
    """
    Takes a minimap2 paf and reads it in with the first 12 columns. Ignores the rest.
    Filters for each read (qseqid) the best hits on mquality, taking the highest value.
    Unlike pull_mapping_results there is no second filtering step on nmatch, so every hit
    that ties on the highest mquality of a read is counted.
    Returns a tuple (hit_df, tmp_df):
    hit_df has the tnames as index and the counts of hits as column 'count', sorted by descending count.
    tmp_df holds all alignments of the paf, unfiltered.
    Both dataframes have the taxrank columns ['k', 'p', 'c', 'o', 'f', 'g', 's'] that are all False to start with.
    These columns are object-typed so that taxon names can be written into them afterwards.
    """
    min_header = ['qseqid', 'qlen', 'qstart', 'qstop', 'strand', 'tname', 'tlen', 'tstart', 'tend', 'nmatch', 'alen', 'mquality']
    tax_ranks = ['k', 'p', 'c', 'o', 'f', 'g', 's']
    tmp_df = pd.read_csv(fn, sep='\t', header=None, usecols=list(range(12)), names=min_header)

    # Per read keep the alignments with the highest mapping quality.
    # The string 'max' selects the groupby maximum without passing the builtin callable.
    best_mquality = tmp_df.groupby('qseqid')['mquality'].transform('max')
    sub_df = tmp_df[tmp_df['mquality'] == best_mquality].reset_index(drop=True)

    # Count the retained hits per target sequence (computed once, index is tname).
    hit_df = sub_df.groupby('tname')['mquality'].count().rename('count').to_frame()
    hit_df.sort_values(by='count', ascending=False, inplace=True)

    # Placeholders stay False until a taxonomy is assigned; object dtype lets them
    # receive strings later without an incompatible-dtype assignment.
    for key in tax_ranks:
        hit_df[key] = pd.Series(False, index=hit_df.index, dtype=object)
        tmp_df[key] = pd.Series(False, index=tmp_df.index, dtype=object)
    return hit_df, tmp_df

In [160]:
os.path.abspath(os.curdir)

'/media/MassStorage/tmp/TE/honours/scripts/Notebooks'

In [161]:
# QIIME-formatted reference sequences and the matching taxonomy file,
# given relative to the current working directory
qiime_db_fn = os.path.abspath('../../analysis/qiime2/db/sh_refs_qiime_ver8_dynamic_02.02.2019.fasta')
qiime_tax_fn = os.path.abspath('../../analysis/qiime2/db/sh_taxonomy_qiime_ver8_dynamic_02.02.2019.txt')
# thread count handed to the mapping step
threads = 10
# base folder for everything produced against the qiime2 database
QIIME_DIR = os.path.abspath('../../analysis/qiime2/')

In [162]:
## Output folders: one for the mappings against this database (named after the
## database file, dots replaced by underscores) and one for the subsampled reads.
mapping_dir = os.path.join(QIIME_DIR, os.path.basename(qiime_db_fn).replace('.fasta', '').replace('.','_'))
# makedirs(exist_ok=True) does not fail when the folder already exists and also
# creates missing parent folders, so no separate existence check is needed.
os.makedirs(mapping_dir, exist_ok=True)
subsampling_dir = os.path.join(QIIME_DIR, 'subsamplereads')
os.makedirs(subsampling_dir, exist_ok=True)

#### Run on test species 'penicillium_chrysogenum'

In [163]:
# Subsample the test species.
# For each 'genus_species' name look up the read file ('path for use' column of ref_df)
# and make it an absolute path below INPUT_BASEDIR.
fn_subsampling = {}
test_species = ['penicillium_chrysogenum']
for x in test_species:
    fn_subsampling[x] = (ref_df[(ref_df['species'] == x.split('_')[1]) & (ref_df['genus'] == x.split('_')[0])]['path for use'].tolist()[0])
    fn_subsampling[x] = os.path.join(INPUT_BASEDIR, fn_subsampling[x])

# Subsample n_reads reads per species into subsampling_dir; remember the output file per species.
# subsamplereads is defined elsewhere in the notebook.
sub_reads_fn = {}
n_reads = 20000
for key, value in fn_subsampling.items():
    species = key
    in_fn = value
    out_fn = os.path.join(subsampling_dir, F'{species}.{n_reads}.fasta')
    subsamplereads(in_fn, out_fn, n_reads)
    sub_reads_fn[species] = out_fn

### Map the reads against the qiime database; one PAF file per species in mapping_dir.
# minimapmapping is defined elsewhere in the notebook.
db_fn = qiime_db_fn
sub_db_mapping_fn = {}
for species, fasta_fn in sub_reads_fn.items():
    db_name = os.path.basename(db_fn).replace('.fasta', '')
    out_fn = os.path.join(mapping_dir, F"{db_name}.{species}.minimap2.paf")
    sub_db_mapping_fn[species] = out_fn
    minimapmapping(fasta_fn, db_fn, out_fn, threads)

### Test out the summary results statistic for a single mapping result
species = 'penicillium_chrysogenum'
mapping_results , full_results_df = pull_mapping_results_v2(sub_db_mapping_fn[species])
# No reference dataframe here: the tnames are used directly as identifiers in the qiime taxonomy file.
mapping_results = assign_taxranks_results(mapping_results, qiime_tax_fn)
taxfileid = getquery_taxfileid(reference_dataframe_fn, species)
# NOTE(review): the query taxonomy is read from taxonomy_file_fn while the hits are
# annotated from qiime_tax_fn; the two files must name the ranks identically for matches to count.
query_tax_dict = get_taxid_dict(taxonomy_file_fn, taxfileid)
### TODO: fix family level for 'penicillium_chrysogenum' (no fix is applied in this cell)
sensitivity_dict = get_accuracy_dict(mapping_results, query_tax_dict)


# Index the unfiltered alignments by tname so the taxonomy can be written per target.
full_results_df.index = full_results_df.tname
### Also look at the full results dataframe to explore results a bit more
for tname in full_results_df.tname.unique():

    tmp_tax_dict = get_taxid_dict(qiime_tax_fn, tname)
    for key, value in tmp_tax_dict.items():
        # the index is not unique: this sets the value on every alignment against this target
        full_results_df.loc[tname, key] = value

:(check one reformat.sh samplereadstarget=20000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode10/length_restricted_for_use.fasta out=/media/MassStorage/tmp/TE/honours/analysis/qiime2/subsamplereads/penicillium_chrysogenum.20000.fasta!!

:)Completed minimap2 -x map-ont -t 10 /media/MassStorage/tmp/TE/honours/analysis/qiime2/db/sh_refs_qiime_ver8_dynamic_02.02.2019.fasta /media/MassStorage/tmp/TE/honours/analysis/qiime2/subsamplereads/penicillium_chrysogenum.20000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/qiime2/sh_refs_qiime_ver8_dynamic_02_02_2019/sh_refs_qiime_ver8_dynamic_02.02.2019.penicillium_chrysogenum.minimap2.paf



In [164]:
sensitivity_dict 

{'k': 1.0,
 'p': 0.8811805461825708,
 'c': 0.1900391415845593,
 'o': 0.18913933504296576,
 'f': 0.1882395285013722,
 'g': 0.08134251136005759,
 's': 0.0038691681288522965}

In [165]:
full_results_df

Unnamed: 0_level_0,qseqid,qlen,qstart,qstop,strand,tname,tlen,tstart,tend,nmatch,alen,mquality,k,p,c,o,f,g,s
tname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
SH1732842.08FU_FJ430779_refs_singleton,4d0876bf-d131-46b6-8075-14de888dded8,2739,81,985,+,SH1732842.08FU_FJ430779_refs_singleton,1661,500,1471,210,974,35,Fungi,Ascomycota,Leotiomycetes,Helotiales,Helotiaceae,Acidea,Acidea_extrema
SH1654757.08FU_UDB014954_reps,4d0876bf-d131-46b6-8075-14de888dded8,2739,1679,2520,+,SH1654757.08FU_UDB014954_reps,1390,493,1356,60,864,13,Fungi,Ascomycota,Orbiliomycetes,Orbiliales,unidentified,unidentified,unidentified
SH2190001.08FU_AY371635_refs,4d0876bf-d131-46b6-8075-14de888dded8,2739,1223,1666,+,SH2190001.08FU_AY371635_refs,516,32,508,51,476,0,Fungi,Ascomycota,Eurotiomycetes,Eurotiales,Aspergillaceae,Penicillium,Penicillium_aethiopicum
SH1560162.08FU_AY225488_reps,4d0876bf-d131-46b6-8075-14de888dded8,2739,1696,2475,+,SH1560162.08FU_AY225488_reps,1828,587,1376,45,789,0,Fungi,Ascomycota,Sordariomycetes,Microascales,Halosphaeriaceae,Lignincola,Lignincola_laevis
SH1900984.08FU_AF263347_refs_singleton,4d0876bf-d131-46b6-8075-14de888dded8,2739,1496,1648,+,SH1900984.08FU_AF263347_refs_singleton,507,332,500,44,168,0,Fungi,Ascomycota,Eurotiomycetes,Eurotiales,Aspergillaceae,Penicillium,Penicillium_bovifimosum
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SH2189929.08FU_NR_103692_refs,f15f2f35-a853-4012-b6cc-aac9c680cb50,3007,1341,1736,+,SH2189929.08FU_NR_103692_refs,495,32,424,141,400,0,Fungi,Ascomycota,Eurotiomycetes,Eurotiales,Aspergillaceae,Penicillium,Penicillium_griseofulvum
SH2190041.08FU_KM973207_refs,f15f2f35-a853-4012-b6cc-aac9c680cb50,3007,1341,1736,+,SH2190041.08FU_KM973207_refs,495,32,424,141,400,0,Fungi,Ascomycota,Eurotiomycetes,Eurotiales,Aspergillaceae,Penicillium,Penicillium_compactum
SH1649638.08FU_UDB016769_refs,f15f2f35-a853-4012-b6cc-aac9c680cb50,3007,2624,2874,+,SH1649638.08FU_UDB016769_refs,1720,1429,1681,101,256,60,Fungi,Basidiomycota,Agaricomycetes,Thelephorales,Thelephoraceae,Tomentella,unidentified
SH1692805.08FU_JN617705_refs,f15f2f35-a853-4012-b6cc-aac9c680cb50,3007,1660,1819,+,SH1692805.08FU_JN617705_refs,510,345,506,49,161,0,Fungi,Ascomycota,Eurotiomycetes,Eurotiales,Aspergillaceae,Penicillium,Penicillium_rolfsii


In [166]:
### Look at the results unfiltered:
### every alignment gets a count of 1, so get_accuracy_dict weighs all alignments equally
full_results_df['count'] = 1

get_accuracy_dict(full_results_df, query_tax_dict)

{'k': 1.0,
 'p': 0.9115974116524569,
 'c': 0.4380680358650047,
 'o': 0.432655258039657,
 'f': 0.43007424985627085,
 'g': 0.38932857090433143,
 's': 0.030183117026091423}

##### These are weird results that might be linked to

* database issues as you can see

sh_taxonomy_qiime_ver8_dynamic_02.02.2019.txt: k__Fungi;p__Ascomycota;c__Eurotiomycetes;o__Eurotiales;f__Aspergillaceae;g__Penicillium;s__Penicillium_chrysogenum

In [167]:
mapping_results[mapping_results['s'] == 'Penicillium_chrysogenum']

Unnamed: 0_level_0,count,k,p,c,o,f,g,s
tname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SH1530047.08FU_HE649392_reps,52,Fungi,Ascomycota,Eurotiomycetes,Eurotiales,Aspergillaceae,Penicillium,Penicillium_chrysogenum
SH2189908.08FU_AY213669_refs,34,Fungi,Ascomycota,Eurotiomycetes,Eurotiales,Aspergillaceae,Penicillium,Penicillium_chrysogenum


In [168]:
# Build the query taxonomy by hand from the lineage string used in the qiime taxonomy file.
taxrank = 'k__Fungi;p__Ascomycota;c__Eurotiomycetes;o__Eurotiales;f__Aspergillaceae;g__Penicillium;s__Penicillium_chrysogenum'
query_tax_dict = {}
for rank_id in taxrank.split(';'):
    # each entry reads '<rank letter>__<taxon name>'
    rank_letter, taxon_name = rank_id.split('__')[:2]
    query_tax_dict[rank_letter] = taxon_name
query_tax_dict

{'k': 'Fungi',
 'p': 'Ascomycota',
 'c': 'Eurotiomycetes',
 'o': 'Eurotiales',
 'f': 'Aspergillaceae',
 'g': 'Penicillium',
 's': 'Penicillium_chrysogenum'}

In [169]:
get_accuracy_dict(mapping_results, query_tax_dict)

{'k': 1.0,
 'p': 0.8811805461825708,
 'c': 0.1900391415845593,
 'o': 0.18913933504296576,
 'f': 0.1882395285013722,
 'g': 0.08134251136005759,
 's': 0.0038691681288522965}

In [170]:
### Sanity check: there must be at least as many counted mapping results as mapped reads,
### because a read with tied best hits is counted once per tied hit.
mapping_results['count'].sum() >= full_results_df.qseqid.unique().shape[0]

True

#### Testing on Candida albicans

In [171]:
# Subsample the test species.
# For each 'genus_species' name look up the read file ('path for use' column of ref_df)
# and make it an absolute path below INPUT_BASEDIR.
fn_subsampling = {}
test_species = ['candida_albicans']
for x in test_species:
    fn_subsampling[x] = (ref_df[(ref_df['species'] == x.split('_')[1]) & (ref_df['genus'] == x.split('_')[0])]['path for use'].tolist()[0])
    fn_subsampling[x] = os.path.join(INPUT_BASEDIR, fn_subsampling[x])

# Subsample n_reads reads per species into subsampling_dir; remember the output file per species.
# subsamplereads is defined elsewhere in the notebook.
sub_reads_fn = {}
n_reads = 20000
for key, value in fn_subsampling.items():
    species = key
    in_fn = value
    out_fn = os.path.join(subsampling_dir, F'{species}.{n_reads}.fasta')
    subsamplereads(in_fn, out_fn, n_reads)
    sub_reads_fn[species] = out_fn

:(check one reformat.sh samplereadstarget=20000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20180108_FAH18647/barcode03/length_restricted_for_use.fasta out=/media/MassStorage/tmp/TE/honours/analysis/qiime2/subsamplereads/candida_albicans.20000.fasta!!



In [172]:
### Map the reads against the qiime database; one PAF file per species in mapping_dir.
# minimapmapping is defined elsewhere in the notebook.
db_fn = qiime_db_fn
sub_db_mapping_fn = {}
for species, fasta_fn in sub_reads_fn.items():
    db_name = os.path.basename(db_fn).replace('.fasta', '')
    out_fn = os.path.join(mapping_dir, F"{db_name}.{species}.minimap2.paf")
    # record the output path before mapping so it can be looked up per species afterwards
    sub_db_mapping_fn[species] = out_fn
    minimapmapping(fasta_fn, db_fn, out_fn, threads)

:)Completed minimap2 -x map-ont -t 10 /media/MassStorage/tmp/TE/honours/analysis/qiime2/db/sh_refs_qiime_ver8_dynamic_02.02.2019.fasta /media/MassStorage/tmp/TE/honours/analysis/qiime2/subsamplereads/candida_albicans.20000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/qiime2/sh_refs_qiime_ver8_dynamic_02_02_2019/sh_refs_qiime_ver8_dynamic_02.02.2019.candida_albicans.minimap2.paf



In [173]:
### Test out the summary results statistic for a single mapping result
species = test_species[0]
print(sub_db_mapping_fn[species])
mapping_results , full_results_df = pull_mapping_results_v2(sub_db_mapping_fn[species])
# No reference dataframe here: the tnames are used directly as identifiers in the qiime taxonomy file.
mapping_results = assign_taxranks_results(mapping_results, qiime_tax_fn)
taxfileid = getquery_taxfileid(reference_dataframe_fn, species)
print(taxfileid)
# NOTE(review): the query taxonomy is read from taxonomy_file_fn while the hits are
# annotated from qiime_tax_fn; the two files must name the ranks identically for matches to count.
query_tax_dict = get_taxid_dict(taxonomy_file_fn, taxfileid)
print(query_tax_dict)

sensitivity_dict = get_accuracy_dict(mapping_results, query_tax_dict)

# Index the unfiltered alignments by tname so the taxonomy can be written per target.
full_results_df.index = full_results_df.tname
### Also look at the full results dataframe to explore results a bit more
for tname in full_results_df.tname.unique():

    tmp_tax_dict = get_taxid_dict(qiime_tax_fn, tname)
    for key, value in tmp_tax_dict.items():
        # the index is not unique: this sets the value on every alignment against this target
        full_results_df.loc[tname, key] = value

/media/MassStorage/tmp/TE/honours/analysis/qiime2/sh_refs_qiime_ver8_dynamic_02_02_2019/sh_refs_qiime_ver8_dynamic_02.02.2019.candida_albicans.minimap2.paf
20180108_FAH18647/barcode03
{'k': 'Fungi', 'p': 'Ascomycota', 'c': 'Saccharomycetes', 'o': 'Saccharomycetales', 'f': 'Saccharomycetales_fam_Incertae_sedis', 'g': 'Candida', 's': 'Candida_albicans'}


In [174]:
sensitivity_dict

{'k': 1.0,
 'p': 0.8175055999323781,
 'c': 0.5192088246481552,
 'o': 0.5192088246481552,
 'f': 0.4455855627403745,
 'g': 0.4455855627403745,
 's': 0.4186636236845442}

In [175]:
def pull_mapping_results_v3(fn):
    """
    Takes a minimap2 paf and reads it in with the first 12 columns. Ignores the rest.
    Adds the column 'cscore' = alen / (alen - nmatch) to every alignment; the score grows as
    the number of non-matching positions in the alignment shrinks (it is inf when alen equals nmatch).
    Filters for each read (qseqid) the best hits on cscore, taking the highest value.
    mquality and nmatch are not used for filtering in this version; every hit that ties on
    the highest cscore of a read is counted.
    Returns a tuple (hit_df, tmp_df):
    hit_df has the tnames as index and the counts of hits as column 'count', sorted by descending count.
    tmp_df holds all alignments of the paf, unfiltered, including the 'cscore' column.
    Both dataframes have the taxrank columns ['k', 'p', 'c', 'o', 'f', 'g', 's'] that are all False to start with.
    These columns are object-typed so that taxon names can be written into them afterwards.
    """
    min_header = ['qseqid', 'qlen', 'qstart', 'qstop', 'strand', 'tname', 'tlen', 'tstart', 'tend', 'nmatch', 'alen', 'mquality']
    tax_ranks = ['k', 'p', 'c', 'o', 'f', 'g', 's']
    tmp_df = pd.read_csv(fn, sep='\t', header=None, usecols=list(range(12)), names=min_header)
    tmp_df['cscore'] = tmp_df['alen']/(tmp_df['alen']-tmp_df['nmatch'])

    # Per read keep the alignments with the highest cscore.
    # The string 'max' selects the groupby maximum without passing the builtin callable.
    best_cscore = tmp_df.groupby('qseqid')['cscore'].transform('max')
    sub_df = tmp_df[tmp_df['cscore'] == best_cscore].reset_index(drop=True)

    # Count the retained hits per target sequence (computed once, index is tname).
    hit_df = sub_df.groupby('tname')['cscore'].count().rename('count').to_frame()
    hit_df.sort_values(by='count', ascending=False, inplace=True)

    # Placeholders stay False until a taxonomy is assigned; object dtype lets them
    # receive strings later without an incompatible-dtype assignment.
    for key in tax_ranks:
        hit_df[key] = pd.Series(False, index=hit_df.index, dtype=object)
        tmp_df[key] = pd.Series(False, index=tmp_df.index, dtype=object)
    return hit_df, tmp_df

In [176]:
os.path.abspath(os.curdir)

'/media/MassStorage/tmp/TE/honours/scripts/Notebooks'

In [177]:
# QIIME-formatted reference sequences and the matching taxonomy file,
# given relative to the current working directory
qiime_db_fn = os.path.abspath('../../analysis/qiime2/db/sh_refs_qiime_ver8_dynamic_02.02.2019.fasta')
qiime_tax_fn = os.path.abspath('../../analysis/qiime2/db/sh_taxonomy_qiime_ver8_dynamic_02.02.2019.txt')
# thread count handed to the mapping step
threads = 10
# base folder for everything produced against the qiime2 database
QIIME_DIR = os.path.abspath('../../analysis/qiime2/')

In [178]:
## Output folders: one for the mappings against this database (named after the
## database file, dots replaced by underscores) and one for the subsampled reads.
mapping_dir = os.path.join(QIIME_DIR, os.path.basename(qiime_db_fn).replace('.fasta', '').replace('.','_'))
# makedirs(exist_ok=True) does not fail when the folder already exists and also
# creates missing parent folders, so no separate existence check is needed.
os.makedirs(mapping_dir, exist_ok=True)
subsampling_dir = os.path.join(QIIME_DIR, 'subsamplereads')
os.makedirs(subsampling_dir, exist_ok=True)

#### Run on test species 'penicillium_chrysogenum'

In [179]:
# Subsample the test species.
# For each 'genus_species' name look up the read file ('path for use' column of ref_df)
# and make it an absolute path below INPUT_BASEDIR.
fn_subsampling = {}
test_species = ['penicillium_chrysogenum']
for x in test_species:
    fn_subsampling[x] = (ref_df[(ref_df['species'] == x.split('_')[1]) & (ref_df['genus'] == x.split('_')[0])]['path for use'].tolist()[0])
    fn_subsampling[x] = os.path.join(INPUT_BASEDIR, fn_subsampling[x])

# Subsample n_reads reads per species into subsampling_dir; remember the output file per species.
# subsamplereads is defined elsewhere in the notebook.
sub_reads_fn = {}
n_reads = 20000
for key, value in fn_subsampling.items():
    species = key
    in_fn = value
    out_fn = os.path.join(subsampling_dir, F'{species}.{n_reads}.fasta')
    subsamplereads(in_fn, out_fn, n_reads)
    sub_reads_fn[species] = out_fn

### Map the reads against the qiime database; one PAF file per species in mapping_dir.
# minimapmapping is defined elsewhere in the notebook.
db_fn = qiime_db_fn
sub_db_mapping_fn = {}
for species, fasta_fn in sub_reads_fn.items():
    db_name = os.path.basename(db_fn).replace('.fasta', '')
    out_fn = os.path.join(mapping_dir, F"{db_name}.{species}.minimap2.paf")
    sub_db_mapping_fn[species] = out_fn
    minimapmapping(fasta_fn, db_fn, out_fn, threads)

### Test out the summary results statistic for a single mapping result
species = 'penicillium_chrysogenum'
# v3 selects the best hit per read on cscore instead of mquality
mapping_results , full_results_df = pull_mapping_results_v3(sub_db_mapping_fn[species])
# No reference dataframe here: the tnames are used directly as identifiers in the qiime taxonomy file.
mapping_results = assign_taxranks_results(mapping_results, qiime_tax_fn)
taxfileid = getquery_taxfileid(reference_dataframe_fn, species)
# NOTE(review): the query taxonomy is read from taxonomy_file_fn while the hits are
# annotated from qiime_tax_fn; the two files must name the ranks identically for matches to count.
query_tax_dict = get_taxid_dict(taxonomy_file_fn, taxfileid)
### TODO: fix family level for 'penicillium_chrysogenum' (no fix is applied in this cell)
sensitivity_dict = get_accuracy_dict(mapping_results, query_tax_dict)


# Index the unfiltered alignments by tname so the taxonomy can be written per target.
full_results_df.index = full_results_df.tname
### Also look at the full results dataframe to explore results a bit more
for tname in full_results_df.tname.unique():

    tmp_tax_dict = get_taxid_dict(qiime_tax_fn, tname)
    for key, value in tmp_tax_dict.items():
        # the index is not unique: this sets the value on every alignment against this target
        full_results_df.loc[tname, key] = value

:(check one reformat.sh samplereadstarget=20000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode10/length_restricted_for_use.fasta out=/media/MassStorage/tmp/TE/honours/analysis/qiime2/subsamplereads/penicillium_chrysogenum.20000.fasta!!

:)Completed minimap2 -x map-ont -t 10 /media/MassStorage/tmp/TE/honours/analysis/qiime2/db/sh_refs_qiime_ver8_dynamic_02.02.2019.fasta /media/MassStorage/tmp/TE/honours/analysis/qiime2/subsamplereads/penicillium_chrysogenum.20000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/qiime2/sh_refs_qiime_ver8_dynamic_02_02_2019/sh_refs_qiime_ver8_dynamic_02.02.2019.penicillium_chrysogenum.minimap2.paf



In [180]:
sensitivity_dict 

{'k': 1.0,
 'p': 0.9551197165455781,
 'c': 0.6736337282130203,
 'o': 0.6466482946208081,
 'f': 0.6438566980423034,
 'g': 0.5488350452739702,
 's': 0.043699223363515984}

In [181]:
full_results_df

Unnamed: 0_level_0,qseqid,qlen,qstart,qstop,strand,tname,tlen,tstart,tend,nmatch,alen,mquality,cscore,k,p,c,o,f,g,s
tname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
SH1732842.08FU_FJ430779_refs_singleton,4d0876bf-d131-46b6-8075-14de888dded8,2739,81,985,+,SH1732842.08FU_FJ430779_refs_singleton,1661,500,1471,210,974,35,1.274869,Fungi,Ascomycota,Leotiomycetes,Helotiales,Helotiaceae,Acidea,Acidea_extrema
SH1654757.08FU_UDB014954_reps,4d0876bf-d131-46b6-8075-14de888dded8,2739,1679,2520,+,SH1654757.08FU_UDB014954_reps,1390,493,1356,60,864,13,1.074627,Fungi,Ascomycota,Orbiliomycetes,Orbiliales,unidentified,unidentified,unidentified
SH2190001.08FU_AY371635_refs,4d0876bf-d131-46b6-8075-14de888dded8,2739,1223,1666,+,SH2190001.08FU_AY371635_refs,516,32,508,51,476,0,1.120000,Fungi,Ascomycota,Eurotiomycetes,Eurotiales,Aspergillaceae,Penicillium,Penicillium_aethiopicum
SH1560162.08FU_AY225488_reps,4d0876bf-d131-46b6-8075-14de888dded8,2739,1696,2475,+,SH1560162.08FU_AY225488_reps,1828,587,1376,45,789,0,1.060484,Fungi,Ascomycota,Sordariomycetes,Microascales,Halosphaeriaceae,Lignincola,Lignincola_laevis
SH1900984.08FU_AF263347_refs_singleton,4d0876bf-d131-46b6-8075-14de888dded8,2739,1496,1648,+,SH1900984.08FU_AF263347_refs_singleton,507,332,500,44,168,0,1.354839,Fungi,Ascomycota,Eurotiomycetes,Eurotiales,Aspergillaceae,Penicillium,Penicillium_bovifimosum
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SH2189929.08FU_NR_103692_refs,f15f2f35-a853-4012-b6cc-aac9c680cb50,3007,1341,1736,+,SH2189929.08FU_NR_103692_refs,495,32,424,141,400,0,1.544402,Fungi,Ascomycota,Eurotiomycetes,Eurotiales,Aspergillaceae,Penicillium,Penicillium_griseofulvum
SH2190041.08FU_KM973207_refs,f15f2f35-a853-4012-b6cc-aac9c680cb50,3007,1341,1736,+,SH2190041.08FU_KM973207_refs,495,32,424,141,400,0,1.544402,Fungi,Ascomycota,Eurotiomycetes,Eurotiales,Aspergillaceae,Penicillium,Penicillium_compactum
SH1649638.08FU_UDB016769_refs,f15f2f35-a853-4012-b6cc-aac9c680cb50,3007,2624,2874,+,SH1649638.08FU_UDB016769_refs,1720,1429,1681,101,256,60,1.651613,Fungi,Basidiomycota,Agaricomycetes,Thelephorales,Thelephoraceae,Tomentella,unidentified
SH1692805.08FU_JN617705_refs,f15f2f35-a853-4012-b6cc-aac9c680cb50,3007,1660,1819,+,SH1692805.08FU_JN617705_refs,510,345,506,49,161,0,1.437500,Fungi,Ascomycota,Eurotiomycetes,Eurotiales,Aspergillaceae,Penicillium,Penicillium_rolfsii


In [182]:
### Look at the results unfiltered:
### every alignment gets a count of 1, so get_accuracy_dict weighs all alignments equally
full_results_df['count'] = 1

get_accuracy_dict(full_results_df, query_tax_dict)

{'k': 1.0,
 'p': 0.9115974116524569,
 'c': 0.4380680358650047,
 'o': 0.432655258039657,
 'f': 0.43007424985627085,
 'g': 0.38932857090433143,
 's': 0.030183117026091423}

##### These are weird results that might be linked to

* database issues as you can see

sh_taxonomy_qiime_ver8_dynamic_02.02.2019.txt: k__Fungi;p__Ascomycota;c__Eurotiomycetes;o__Eurotiales;f__Aspergillaceae;g__Penicillium;s__Penicillium_chrysogenum

In [183]:
mapping_results[mapping_results['s'] == 'Penicillium_chrysogenum']

Unnamed: 0_level_0,count,k,p,c,o,f,g,s
tname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
SH2189908.08FU_AY213669_refs,844,Fungi,Ascomycota,Eurotiomycetes,Eurotiales,Aspergillaceae,Penicillium,Penicillium_chrysogenum
SH1530047.08FU_HE649392_reps,377,Fungi,Ascomycota,Eurotiomycetes,Eurotiales,Aspergillaceae,Penicillium,Penicillium_chrysogenum


In [184]:
# Build the query taxonomy by hand from the lineage string used in the qiime taxonomy file.
taxrank = 'k__Fungi;p__Ascomycota;c__Eurotiomycetes;o__Eurotiales;f__Aspergillaceae;g__Penicillium;s__Penicillium_chrysogenum'
query_tax_dict = {}
for rank_id in taxrank.split(';'):
    # each entry reads '<rank letter>__<taxon name>'
    rank_letter, taxon_name = rank_id.split('__')[:2]
    query_tax_dict[rank_letter] = taxon_name
query_tax_dict

{'k': 'Fungi',
 'p': 'Ascomycota',
 'c': 'Eurotiomycetes',
 'o': 'Eurotiales',
 'f': 'Aspergillaceae',
 'g': 'Penicillium',
 's': 'Penicillium_chrysogenum'}

In [185]:
get_accuracy_dict(mapping_results, query_tax_dict)

{'k': 1.0,
 'p': 0.9551197165455781,
 'c': 0.6736337282130203,
 'o': 0.6466482946208081,
 'f': 0.6438566980423034,
 'g': 0.5488350452739702,
 's': 0.043699223363515984}

In [186]:
### Sanity check: there must be at least as many counted mapping results as mapped reads,
### because a read with tied best hits is counted once per tied hit.
mapping_results['count'].sum() >= full_results_df.qseqid.unique().shape[0]

True

#### Testing on Candida albicans

In [187]:
# Subsample reads for the test species.
# Species keys are '<genus>_<species>'; the two parts are matched against the
# 'genus' and 'species' columns of ref_df to find the read file.
fn_subsampling = {}
test_species = ['candida_albicans']
for x in test_species:
    # The first matching row is used; .tolist()[0] raises IndexError if nothing matches.
    fn_subsampling[x] = (ref_df[(ref_df['species'] == x.split('_')[1]) & (ref_df['genus'] == x.split('_')[0])]['path for use'].tolist()[0])
    # 'path for use' is joined onto INPUT_BASEDIR to give the input file path.
    fn_subsampling[x] = os.path.join(INPUT_BASEDIR, fn_subsampling[x])

# Subsample n_reads reads per species and remember the output file per species.
sub_reads_fn = {}
n_reads = 20000
for key, value in fn_subsampling.items():
    species = key
    in_fn = value
    out_fn = os.path.join(subsampling_dir, F'{species}.{n_reads}.fasta')
    subsamplereads(in_fn, out_fn, n_reads)
    sub_reads_fn[species] = out_fn

:(check one reformat.sh samplereadstarget=20000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20180108_FAH18647/barcode03/length_restricted_for_use.fasta out=/media/MassStorage/tmp/TE/honours/analysis/qiime2/subsamplereads/candida_albicans.20000.fasta!!



In [188]:
###Map the reads
# Every subsampled fasta is mapped against the qiime database; the paf output
# path per species is kept in sub_db_mapping_fn.
db_fn = qiime_db_fn
sub_db_mapping_fn = {}
for species, fasta_fn in sub_reads_fn.items():
    # Output name is '<database basename without .fasta>.<species>.minimap2.paf'
    db_name = os.path.basename(db_fn).replace('.fasta', '')
    out_fn = os.path.join(mapping_dir, F"{db_name}.{species}.minimap2.paf")
    sub_db_mapping_fn[species] = out_fn
    minimapmapping(fasta_fn, db_fn, out_fn, threads)

:)Completed minimap2 -x map-ont -t 10 /media/MassStorage/tmp/TE/honours/analysis/qiime2/db/sh_refs_qiime_ver8_dynamic_02.02.2019.fasta /media/MassStorage/tmp/TE/honours/analysis/qiime2/subsamplereads/candida_albicans.20000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/qiime2/sh_refs_qiime_ver8_dynamic_02_02_2019/sh_refs_qiime_ver8_dynamic_02.02.2019.candida_albicans.minimap2.paf



In [189]:
###Test out the summary results statistic for a single mapping result
species = test_species[0]
print(sub_db_mapping_fn[species])
# mapping_results: hit counts per target; full_results_df: all alignments
mapping_results , full_results_df = pull_mapping_results_v3(sub_db_mapping_fn[species])
# presumably fills the taxrank columns from the qiime taxonomy file -- helper not shown here
mapping_results = assign_taxranks_results(mapping_results, qiime_tax_fn)
# Look up the known taxonomy of the query species (the expected answer)
taxfileid = getquery_taxfileid(reference_dataframe_fn, species)
print(taxfileid)
query_tax_dict = get_taxid_dict(taxonomy_file_fn, taxfileid)
print(query_tax_dict)

# Fraction of hit counts that agree with the query taxonomy at each rank
sensitivity_dict = get_accuracy_dict(mapping_results, query_tax_dict)

# Index by tname so that .loc[tname, key] below addresses every alignment of that target
full_results_df.index = full_results_df.tname
###Also look at the full results dataframe to explore results a bit more
for tname in full_results_df.tname.unique():

    tmp_tax_dict = get_taxid_dict(qiime_tax_fn, tname)
    for key, value in tmp_tax_dict.items():
        full_results_df.loc[tname, key] = value

/media/MassStorage/tmp/TE/honours/analysis/qiime2/sh_refs_qiime_ver8_dynamic_02_02_2019/sh_refs_qiime_ver8_dynamic_02.02.2019.candida_albicans.minimap2.paf
20180108_FAH18647/barcode03
{'k': 'Fungi', 'p': 'Ascomycota', 'c': 'Saccharomycetes', 'o': 'Saccharomycetales', 'f': 'Saccharomycetales_fam_Incertae_sedis', 'g': 'Candida', 's': 'Candida_albicans'}


In [190]:
sensitivity_dict

{'k': 1.0,
 'p': 0.9333918831468984,
 'c': 0.5988259221595637,
 'o': 0.5988259221595637,
 'f': 0.3706203198668762,
 'g': 0.3706203198668762,
 's': 0.30248682629194784}

#### Might want to double check how your families are matched in your taxonomic input file for your known test species

In [191]:
# mapping_results[mapping_results['s'] == 'Candida_albicans']

In [192]:
# query_tax_dict['f'] = 'Saccharomycetales_fam_Incertae_sedis'

In [193]:
# get_accuracy_dict(mapping_results, query_tax_dict)

### It appears that the filtering of results by mapping quality works well for the long ITS database but not for the qiime database

In [194]:
full_results_df.columns

Index(['qseqid', 'qlen', 'qstart', 'qstop', 'strand', 'tname', 'tlen',
       'tstart', 'tend', 'nmatch', 'alen', 'mquality', 'cscore', 'k', 'p', 'c',
       'o', 'f', 'g', 's'],
      dtype='object')

In [195]:
full_results_df.tname.shape

(133529,)

In [196]:
full_results_df.tname.unique().shape

(166,)

In [197]:
full_results_df.index = full_results_df.tname

In [198]:
###Assign taxonomic ranks to the full_results_df
# Relies on full_results_df being indexed by tname (set in the previous cell),
# so each .loc[tname, key] assignment writes to all alignments of that target.
for tname in full_results_df.tname.unique():
    tmp_tax_dict = get_taxid_dict(qiime_tax_fn, tname)
    for key, value in tmp_tax_dict.items():
        full_results_df.loc[tname, key] = value

In [199]:
full_results_df[full_results_df['g'] == 'Candida']['qseqid'].shape

(28992,)

In [200]:
full_results_df[full_results_df['g'] == 'Candida']['qseqid'].unique().shape

(18016,)

In [201]:
###look at the results unfiltered
full_results_df['count'] = 1

get_accuracy_dict(full_results_df, query_tax_dict)

{'k': 1.0,
 'p': 0.8167813733346314,
 'c': 0.2981150162137064,
 'o': 0.2981150162137064,
 'f': 0.2171213743830928,
 'g': 0.2171213743830928,
 's': 0.17868028667929814}

Looking at the unfiltered results doesn't work very well either. We might need to look into a different filtering of the alignments, or the qiime2 database might just not be very useful for the noisy reads. Simulated reads with higher accuracy should do better here.

In [202]:
full_results_df.groupby('g').count()['k'].sort_values(ascending=False)

g
Candida                28992
Acidea                 16877
Plectosphaerella       12234
unidentified           10983
Nakaseomyces           10297
Oidiodendron            9564
Tomentella              8986
Knoxdaviesia            7567
Lignincola              6155
Lacrymaria              5027
Cystoagaricus           4941
Bannoa                  4640
Pseudorobillarda        2835
Deltopyxis               876
Pestalotiopsis           712
Leveillula               668
Wickerhamiella           424
Orpinomyces              384
Sporisorium              146
Phyllactinia             134
Exophiala                132
Pyrenodesmia             131
Physcia                  126
Physconia                123
Rhinocladiella           111
Pseudophaeomoniella      105
Sclerophora              104
Conioscypha               40
Chromocleista             39
Fusarium                  31
Rhodotorula               26
Xylaria                   19
Lodderomyces              15
Scheffersomyces           15
Spathaspora 

In [203]:
###There must be more or equal number of mapping results compared to number of mapped reads
mapping_results['count'].sum() >= full_results_df.qseqid.unique().shape[0]

True

#### Testing on other species

In [204]:
# Subsample reads for the test species.
# Species keys are '<genus>_<species>'; the two parts are matched against the
# 'genus' and 'species' columns of ref_df to find the read file.
fn_subsampling = {}
test_species = ['cortinarius_globuliformis']
for x in test_species:
    # The first matching row is used; .tolist()[0] raises IndexError if nothing matches.
    fn_subsampling[x] = (ref_df[(ref_df['species'] == x.split('_')[1]) & (ref_df['genus'] == x.split('_')[0])]['path for use'].tolist()[0])
    # 'path for use' is joined onto INPUT_BASEDIR to give the input file path.
    fn_subsampling[x] = os.path.join(INPUT_BASEDIR, fn_subsampling[x])

# Subsample n_reads reads per species and remember the output file per species.
sub_reads_fn = {}
n_reads = 20000
for key, value in fn_subsampling.items():
    species = key
    in_fn = value
    out_fn = os.path.join(subsampling_dir, F'{species}.{n_reads}.fasta')
    subsamplereads(in_fn, out_fn, n_reads)
    sub_reads_fn[species] = out_fn

:(check one reformat.sh samplereadstarget=20000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode06/length_restricted_for_use.fasta out=/media/MassStorage/tmp/TE/honours/analysis/qiime2/subsamplereads/cortinarius_globuliformis.20000.fasta!!



In [205]:
###Map the reads
# Every subsampled fasta is mapped against the qiime database; the paf output
# path per species is kept in sub_db_mapping_fn.
db_fn = qiime_db_fn
sub_db_mapping_fn = {}
for species, fasta_fn in sub_reads_fn.items():
    # Output name is '<database basename without .fasta>.<species>.minimap2.paf'
    db_name = os.path.basename(db_fn).replace('.fasta', '')
    out_fn = os.path.join(mapping_dir, F"{db_name}.{species}.minimap2.paf")
    sub_db_mapping_fn[species] = out_fn
    minimapmapping(fasta_fn, db_fn, out_fn, threads)

:)Completed minimap2 -x map-ont -t 10 /media/MassStorage/tmp/TE/honours/analysis/qiime2/db/sh_refs_qiime_ver8_dynamic_02.02.2019.fasta /media/MassStorage/tmp/TE/honours/analysis/qiime2/subsamplereads/cortinarius_globuliformis.20000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/qiime2/sh_refs_qiime_ver8_dynamic_02_02_2019/sh_refs_qiime_ver8_dynamic_02.02.2019.cortinarius_globuliformis.minimap2.paf



In [206]:
###Test out the summary results statistic for a single mapping result (v2 filtering)
species = test_species[0]
print(sub_db_mapping_fn[species])
mapping_results , full_results_df = pull_mapping_results_v2(sub_db_mapping_fn[species])
mapping_results = assign_taxranks_results(mapping_results, qiime_tax_fn)
# Look up the known taxonomy of the query species (the expected answer)
taxfileid = getquery_taxfileid(reference_dataframe_fn, species)
print(taxfileid)
query_tax_dict = get_taxid_dict(taxonomy_file_fn, taxfileid)
print(query_tax_dict)

# Fraction of hit counts that agree with the query taxonomy at each rank
sensitivity_dict = get_accuracy_dict(mapping_results, query_tax_dict)

# Index by tname before the .loc assignments below. Without it, .loc[tname, key]
# on an integer index appends new rows labelled by tname instead of annotating
# the existing alignments. Setting the index again is harmless if it is already tname.
full_results_df.index = full_results_df.tname
###Also look at the full results dataframe to explore results a bit more
for tname in full_results_df.tname.unique():
    tmp_tax_dict = get_taxid_dict(qiime_tax_fn, tname)
    for key, value in tmp_tax_dict.items():
        full_results_df.loc[tname, key] = value

/media/MassStorage/tmp/TE/honours/analysis/qiime2/sh_refs_qiime_ver8_dynamic_02_02_2019/sh_refs_qiime_ver8_dynamic_02.02.2019.cortinarius_globuliformis.minimap2.paf
20171103_FAH15473/barcode06
{'k': 'Fungi', 'p': 'Basidiomycota', 'c': 'Agaricomycetes', 'o': 'Agaricales', 'f': 'Cortinariaceae', 'g': 'Cortinarius', 's': 'Cortinarius_globuliformis'}


In [207]:
sensitivity_dict

{'k': 1.0,
 'p': 0.8857976113322841,
 'c': 0.5234237570595315,
 'o': 0.4376446625312471,
 'f': 0.1712804369965744,
 'g': 0.16669752800666604,
 's': 0.08679751874826405}

In [208]:
###Test out the summary results statistic for a single mapping result
# Same paf file as in the v2 cell, now filtered with pull_mapping_results_v3 for comparison.
species = test_species[0]
print(sub_db_mapping_fn[species])
mapping_results , full_results_df = pull_mapping_results_v3(sub_db_mapping_fn[species])
mapping_results = assign_taxranks_results(mapping_results, qiime_tax_fn)
# Look up the known taxonomy of the query species (the expected answer)
taxfileid = getquery_taxfileid(reference_dataframe_fn, species)
print(taxfileid)
query_tax_dict = get_taxid_dict(taxonomy_file_fn, taxfileid)
print(query_tax_dict)

# Fraction of hit counts that agree with the query taxonomy at each rank
sensitivity_dict = get_accuracy_dict(mapping_results, query_tax_dict)

# Index by tname so that .loc[tname, key] below addresses every alignment of that target
full_results_df.index = full_results_df.tname
###Also look at the full results dataframe to explore results a bit more
for tname in full_results_df.tname.unique():
    tmp_tax_dict = get_taxid_dict(qiime_tax_fn, tname)
    for key, value in tmp_tax_dict.items():
        full_results_df.loc[tname, key] = value

/media/MassStorage/tmp/TE/honours/analysis/qiime2/sh_refs_qiime_ver8_dynamic_02_02_2019/sh_refs_qiime_ver8_dynamic_02.02.2019.cortinarius_globuliformis.minimap2.paf
20171103_FAH15473/barcode06
{'k': 'Fungi', 'p': 'Basidiomycota', 'c': 'Agaricomycetes', 'o': 'Agaricales', 'f': 'Cortinariaceae', 'g': 'Cortinarius', 's': 'Cortinarius_globuliformis'}


In [209]:
sensitivity_dict

{'k': 1.0,
 'p': 0.9100366018108265,
 'c': 0.8413600462338663,
 'o': 0.7906954344057022,
 'f': 0.4180312078597573,
 'g': 0.4037276054710075,
 's': 0.13441533423232518}

In [210]:
###looking at unfiltered results
###look at the results unfiltered
full_results_df['count'] = 1

get_accuracy_dict(full_results_df, query_tax_dict)

{'k': 1.0,
 'p': 0.8248654952161353,
 'c': 0.7081179931621213,
 'o': 0.6266929050382974,
 'f': 0.41488431263417347,
 'g': 0.40360711351408657,
 's': 0.08720945641514935}

#### Testing on all species using v3

In [211]:
import json
from collections import OrderedDict

def get_accuracy_dict(mapping_df, query_tax_dict):
    """
    Summarises the mapping accuracy of the mapping results at all taxonomic ranks.

    mapping_df: dataframe with a 'count' column and one column per taxonomic rank
        ['k', 'p', 'c', 'o', 'f', 'g', 's'] holding the taxon assigned to each row.
    query_tax_dict: dictionary with the known taxon of the query for each of these ranks.

    Returns an OrderedDict (kingdom to species) that gives, for each rank, the fraction of
    the summed 'count' that sits on rows whose taxon equals the query taxon at that rank.
    If the summed 'count' is zero (e.g. no alignments at all) the accuracy is undefined and
    nan is returned for every rank.
    """
    tax_ranks = ['k', 'p', 'c', 'o', 'f', 'g', 's']
    accuracy_dict = OrderedDict()
    total_count = mapping_df['count'].sum()
    if total_count == 0:
        # Avoid 0/0: this raises ZeroDivisionError for plain ints and only warns for numpy ints.
        for tax_rank in tax_ranks:
            accuracy_dict[tax_rank] = float('nan')
        return accuracy_dict
    for tax_rank in tax_ranks:
        is_match = mapping_df[tax_rank] == query_tax_dict[tax_rank]
        hit_count = mapping_df.loc[is_match, 'count'].sum()
        accuracy_dict[tax_rank] = hit_count / total_count
    return accuracy_dict

def minimapmapping(fasta_fn, ref_fn, out_fn, threads):
    """
    Maps reads against a reference with minimap2 (map-ont preset).

    fasta_fn: reads to map.
    ref_fn: reference/database fasta.
    out_fn: output file handed to minimap2 via -o.
    threads: number of threads handed to minimap2 via -t.

    Returns the (exit status, output) tuple of the command. A non-zero exit status is
    reported with a printed message instead of being dropped silently; no exception is raised.
    """
    import shlex  # local import: only needed to quote the command line arguments

    # Quote every argument so that paths with spaces or shell metacharacters stay intact.
    quoted = [shlex.quote(str(arg)) for arg in (threads, ref_fn, fasta_fn, out_fn)]
    command = "minimap2 -x map-ont -t {} {} {} -o {}".format(*quoted)
    status, output = subprocess.getstatusoutput(command)
    if status != 0:
        print(F":( minimap2 failed with exit status {status}: {command}\n{output}")
    return status, output

def pull_mapping_results_v3(fn):
    """
    Takes a minimap2 paf and reads it in with the first 12 columns. Ignores the rest.
    Adds a 'cscore' column computed as alen / (alen - nmatch), so alignments with fewer
    non-matching columns score higher (no non-matching columns gives inf).
    Keeps for each read (qseqid) the alignment(s) with the highest cscore; ties are all kept.

    Returns (hit_df, tmp_df):
    hit_df has the tnames as index and the number of best alignments per tname as column
    'count', sorted by 'count' in descending order.
    tmp_df is the full, unfiltered alignment table including the 'cscore' column.
    Both dataframes also get the taxrank columns ['k', 'p', 'c', 'o', 'f', 'g', 's'] that
    are all False to start with.
    """
    tax_ranks = ['k', 'p', 'c', 'o', 'f', 'g', 's']
    min_header = ['qseqid', 'qlen', 'qstart', 'qstop', 'strand', 'tname', 'tlen', 'tstart', 'tend', 'nmatch', 'alen', 'mquality']
    tmp_df = pd.read_csv(fn, sep='\t', header=None, usecols=list(range(12)), names=min_header)
    tmp_df['cscore'] = tmp_df['alen'] / (tmp_df['alen'] - tmp_df['nmatch'])
    # Pass the string alias 'max': handing the builtin max to transform is deprecated in pandas.
    best_cscore = tmp_df.groupby('qseqid')['cscore'].transform('max')
    sub_df = tmp_df[tmp_df['cscore'] == best_cscore].reset_index(drop=True)
    # NOTE: an additional per-read filter on the highest nmatch was tried at this point
    # and is currently not applied.
    hit_df = sub_df.groupby('tname')['cscore'].count().to_frame('count')
    hit_df.sort_values(by='count', ascending=False, inplace=True)
    for key in tax_ranks:
        hit_df[key] = False
        tmp_df[key] = False
    return hit_df, tmp_df
    
def subsamplereads(in_fn, out_fn, n_reads):
    """
    Subsamples reads from in_fn into out_fn with BBMap's reformat.sh.

    in_fn: input read file.
    out_fn: output file for the subsampled reads.
    n_reads: value handed to reformat.sh as samplereadstarget.

    Returns the (exit status, output) tuple of the command. A non-zero exit status is
    reported with a printed message instead of being dropped silently; no exception is raised.
    """
    import shlex  # local import: only needed to quote the command line arguments

    # Quote each key=value token so that paths with spaces or shell metacharacters stay intact.
    tokens = [F'samplereadstarget={n_reads}', F'in={in_fn}', F'out={out_fn}']
    command = 'reformat.sh ' + ' '.join(shlex.quote(token) for token in tokens)
    status, output = subprocess.getstatusoutput(command)
    if status != 0:
        print(F":( reformat.sh failed with exit status {status}: {command}\n{output}")
    return status, output

# Run every species listed in the reference dataframe, names taken verbatim.
# NOTE: stripping a trailing '-ccl031' / '-ccl029' from the names was tried here
# and is currently not applied.
test_species_list = ref_df.name_species.tolist()
    
# Full pipeline per species: subsample reads, map them against the qiime database,
# filter with pull_mapping_results_v3 and print the accuracy at every taxonomic rank.
# The per-species results are only printed; sensitivity_dict is overwritten each iteration.
for test_species in test_species_list:

    print(test_species)

    #subsample tests species
    fn_subsampling = {}
    # NOTE: the loop variable is rebound to a one-element list so the code copied from
    # the single-species cells (which expect a list) can be reused unchanged.
    test_species = [test_species]
    for x in test_species:
        # Names are '<genus>_<species>'; the first matching ref_df row is used.
        fn_subsampling[x] = (ref_df[(ref_df['species'] == x.split('_')[1]) & (ref_df['genus'] == x.split('_')[0])]['path for use'].tolist()[0])
        fn_subsampling[x] = os.path.join(INPUT_BASEDIR, fn_subsampling[x])

    sub_reads_fn = {}
    n_reads = 20000
    for key, value in fn_subsampling.items():
        species = key
        in_fn = value
        out_fn = os.path.join(subsampling_dir, F'{species}.{n_reads}.fasta')
        subsamplereads(in_fn, out_fn, n_reads)
        sub_reads_fn[species] = out_fn

    ###Map the reads
    db_fn = qiime_db_fn
    sub_db_mapping_fn = {}
    for species, fasta_fn in sub_reads_fn.items():
        # Output name is '<database basename without .fasta>.<species>.minimap2.paf'
        db_name = os.path.basename(db_fn).replace('.fasta', '')
        out_fn = os.path.join(mapping_dir, F"{db_name}.{species}.minimap2.paf")
        sub_db_mapping_fn[species] = out_fn
        minimapmapping(fasta_fn, db_fn, out_fn, threads)

    ###Test out the summary results statistic for a single mapping result
    species = test_species[0]
    mapping_results , full_results_df = pull_mapping_results_v3(sub_db_mapping_fn[species])
    mapping_results = assign_taxranks_results(mapping_results, qiime_tax_fn)
    # Known taxonomy of the query species (the expected answer)
    taxfileid = getquery_taxfileid(reference_dataframe_fn, species)
    query_tax_dict = get_taxid_dict(taxonomy_file_fn, taxfileid)

    sensitivity_dict = get_accuracy_dict(mapping_results, query_tax_dict)

    print(json.dumps(sensitivity_dict, indent=1))
    print('\n')

puccinia_striiformis-tritici
{
 "k": 1.0,
 "p": 0.9577623888807033,
 "c": 0.8779038357644516,
 "o": 0.8779038357644516,
 "f": 0.8760866362162959,
 "g": 0.8655763469377732,
 "s": 0.8004027307106724
}


zymoseptoria_tritici
{
 "k": 1.0,
 "p": 0.8317647609683008,
 "c": 0.35651074589127685,
 "o": 0.18654305379969097,
 "f": 0.18223533267781056,
 "g": 0.16018167345600973,
 "s": 0.0
}


pyrenophora_tritici-repentis
{
 "k": 1.0,
 "p": 0.8342309619238477,
 "c": 0.6039579158316634,
 "o": 0.5966307615230461,
 "f": 0.3619113226452906,
 "g": 0.311685871743487,
 "s": 0.3032940881763527
}


fusarium_oxysporum
{
 "k": 1.0,
 "p": 0.988622754491018,
 "c": 0.8356661676646706,
 "o": 0.5900074850299402,
 "f": 0.5712200598802395,
 "g": 0.4812125748502994,
 "s": 0.10377994011976048
}


tuber_brumale
{
 "k": 1.0,
 "p": 0.9263061411549037,
 "c": 0.4925756186984418,
 "o": 0.4925756186984418,
 "f": 0.4925297891842346,
 "g": 0.4925297891842346,
 "s": 0.49243813015582033
}


cortinarius_globuliformis
{
 "k": 1.0,


### Test run on the qiime2 Database 99

##### Prep on the command line

cp sh_refs_qiime_ver8_99_02.02.2019.fasta /media/WorkingStorage/ben.working/students/tavish/analysis/qiime2/db/. 
cp sh_taxonomy_qiime_ver8_99_02.02.2019.txt /media/WorkingStorage/ben.working/students/tavish/analysis/qiime2/db/.

In [None]:
qiime_db_fn = os.path.abspath('../../analysis/qiime2/db/sh_refs_qiime_ver8_99_02.02.2019.fasta')
qiime_tax_fn = os.path.abspath('../../analysis/qiime2/db/sh_taxonomy_qiime_ver8_99_02.02.2019.txt')
threads = 10
QIIME_DIR = os.path.abspath('../../analysis/qiime2/')

In [None]:
## Mapping folder: one directory per database, named after the database fasta
## (without '.fasta', remaining dots replaced by underscores).
mapping_dir = os.path.join(QIIME_DIR, os.path.basename(qiime_db_fn).replace('.fasta', '').replace('.','_'))
# makedirs(exist_ok=True) also creates a missing QIIME_DIR and avoids the
# check-then-create race of os.path.exists() followed by os.mkdir().
os.makedirs(mapping_dir, exist_ok=True)
## Folder for the subsampled reads
subsampling_dir = os.path.join(QIIME_DIR, 'subsamplereads')
os.makedirs(subsampling_dir, exist_ok=True)

In [None]:
# Subsample reads for the test species.
# Species keys are '<genus>_<species>'; the two parts are matched against the
# 'genus' and 'species' columns of ref_df to find the read file.
fn_subsampling = {}
test_species = ['penicillium_chrysogenum']
for x in test_species:
    # The first matching row is used; .tolist()[0] raises IndexError if nothing matches.
    fn_subsampling[x] = (ref_df[(ref_df['species'] == x.split('_')[1]) & (ref_df['genus'] == x.split('_')[0])]['path for use'].tolist()[0])
    fn_subsampling[x] = os.path.join(INPUT_BASEDIR, fn_subsampling[x])

sub_reads_fn = {}
n_reads = 20000
for key, value in fn_subsampling.items():
    species = key
    in_fn = value
    out_fn = os.path.join(subsampling_dir, F'{species}.{n_reads}.fasta')
    subsamplereads(in_fn, out_fn, n_reads)
    sub_reads_fn[species] = out_fn

###Map the reads
db_fn = qiime_db_fn
sub_db_mapping_fn = {}
for species, fasta_fn in sub_reads_fn.items():
    # Output name is '<database basename without .fasta>.<species>.minimap2.paf'
    db_name = os.path.basename(db_fn).replace('.fasta', '')
    out_fn = os.path.join(mapping_dir, F"{db_name}.{species}.minimap2.paf")
    sub_db_mapping_fn[species] = out_fn
    minimapmapping(fasta_fn, db_fn, out_fn, threads)

###Test out the summary results statistic for a single mapping result
species = 'penicillium_chrysogenum'
mapping_results , full_results_df = pull_mapping_results_v2(sub_db_mapping_fn[species])
mapping_results = assign_taxranks_results(mapping_results, qiime_tax_fn)
taxfileid = getquery_taxfileid(reference_dataframe_fn, species)
query_tax_dict = get_taxid_dict(taxonomy_file_fn, taxfileid)
# NOTE: no family-level fix is applied in this cell; query_tax_dict is used exactly
# as read from taxonomy_file_fn.
sensitivity_dict = get_accuracy_dict(mapping_results, query_tax_dict)


# Index by tname so that .loc[tname, key] below addresses every alignment of that target
full_results_df.index = full_results_df.tname
###Also look at the full results dataframe to explore results a bit more
for tname in full_results_df.tname.unique():

    tmp_tax_dict = get_taxid_dict(qiime_tax_fn, tname)
    for key, value in tmp_tax_dict.items():
        full_results_df.loc[tname, key] = value

In [None]:
sensitivity_dict

In [None]:
###looking at unfiltered results
###look at the results unfiltered
full_results_df['count'] = 1

get_accuracy_dict(full_results_df, query_tax_dict)

In [None]:
###fix the query_tax_dict with the data found in the qiime database
# The lineage string holds '<rank>__<name>' entries separated by ';'.
taxrank = 'k__Fungi;p__Ascomycota;c__Eurotiomycetes;o__Eurotiales;f__Aspergillaceae;g__Penicillium;s__Penicillium_chrysogenum'
query_tax_dict = {}
for rank_id in taxrank.split(';'):
    # Rank letter becomes the key, taxon name the value.
    rank_key, rank_value = rank_id.split('__')[:2]
    query_tax_dict[rank_key] = rank_value

In [None]:
get_accuracy_dict(mapping_results, query_tax_dict)

Use with Candida albicans

In [None]:
# Subsample reads for the test species.
# Species keys are '<genus>_<species>'; the two parts are matched against the
# 'genus' and 'species' columns of ref_df to find the read file.
fn_subsampling = {}
test_species = ['candida_albicans']
for x in test_species:
    # The first matching row is used; .tolist()[0] raises IndexError if nothing matches.
    fn_subsampling[x] = (ref_df[(ref_df['species'] == x.split('_')[1]) & (ref_df['genus'] == x.split('_')[0])]['path for use'].tolist()[0])
    fn_subsampling[x] = os.path.join(INPUT_BASEDIR, fn_subsampling[x])

sub_reads_fn = {}
n_reads = 20000
for key, value in fn_subsampling.items():
    species = key
    in_fn = value
    out_fn = os.path.join(subsampling_dir, F'{species}.{n_reads}.fasta')
    subsamplereads(in_fn, out_fn, n_reads)
    sub_reads_fn[species] = out_fn

###Map the reads
db_fn = qiime_db_fn
sub_db_mapping_fn = {}
for species, fasta_fn in sub_reads_fn.items():
    # Output name is '<database basename without .fasta>.<species>.minimap2.paf'
    db_name = os.path.basename(db_fn).replace('.fasta', '')
    out_fn = os.path.join(mapping_dir, F"{db_name}.{species}.minimap2.paf")
    sub_db_mapping_fn[species] = out_fn
    minimapmapping(fasta_fn, db_fn, out_fn, threads)

###Test out the summary results statistic for a single mapping result
species = 'candida_albicans'
mapping_results , full_results_df = pull_mapping_results_v2(sub_db_mapping_fn[species])
mapping_results = assign_taxranks_results(mapping_results, qiime_tax_fn)
taxfileid = getquery_taxfileid(reference_dataframe_fn, species)
query_tax_dict = get_taxid_dict(taxonomy_file_fn, taxfileid)
# NOTE: no taxonomy fix is applied in this cell; query_tax_dict is used exactly
# as read from taxonomy_file_fn.
sensitivity_dict = get_accuracy_dict(mapping_results, query_tax_dict)


# Index by tname so that .loc[tname, key] below addresses every alignment of that target
full_results_df.index = full_results_df.tname
###Also look at the full results dataframe to explore results a bit more
for tname in full_results_df.tname.unique():

    tmp_tax_dict = get_taxid_dict(qiime_tax_fn, tname)
    for key, value in tmp_tax_dict.items():
        full_results_df.loc[tname, key] = value

In [None]:
sensitivity_dict

In [None]:
###looking at unfiltered results
###look at the results unfiltered
full_results_df['count'] = 1

get_accuracy_dict(full_results_df, query_tax_dict)