### Notebook to analyzse the efficiency of minimap2 mapping for Oxford Nanopore MinION reads against Consensus Database

In [1]:
from Bio import SeqIO
import os
import random
import subprocess
import pandas as pd
import json

In [2]:
reference_dataframe_fn = os.path.abspath('/media/MassStorage/tmp/TE/honours/analysis/Stats/reference_dataframe.csv')
max_custom_database_fn = os.path.abspath('/media/MassStorage/tmp/TE/honours/database/custom_database_labelled.fasta')
taxonomy_file_fn = os.path.abspath('/media/MassStorage/tmp/TE/honours/analysis/Stats/taxonomy_file_qiime.csv')

In [3]:
#threads to use
threads = 6

In [4]:
INPUT_BASEDIR = os.path.abspath('/media/MassStorage/tmp/TE/honours')

In [5]:
OUT_DIR = os.path.abspath('../../analysis/Mapping_mock_gsref')
if not os.path.exists(OUT_DIR):
    os.mkdir(OUT_DIR)

In [6]:
### list of species in the max database
max_species = ['Puccinia_striiformis-tritici',
             'Zymoseptoria_tritici',
             'Pyrenophora_tritici-repentis',
             'Fusarium_oxysporum',
             'Tuber_brumale',
             'Cortinarius_globuliformis',
             'Aspergillus_niger',
             'Clavispora_lusitaniae',
             'Kluyveromyces_unidentified',
             'Penicillium_chrysogenum',
             'Rhodotorula_mucilaginosa',
             'Scedosporium_boydii',
             'Blastobotrys_proliferans',
             'Debaryomyces_unidentified',
             'Galactomyces_geotrichum',
             'Kodamaea_ohmeri',
             'Meyerozyma_guilliermondii',
             'Wickerhamomyces_anomalus',
             'Yamadazyma_mexicana',
             'Yamadazyma_scolyti',
             'Yarrowia_lipolytica',
             'Zygoascus_hellenicus',
             'Aspergillus_flavus',
             'Cryptococcus_zero',
             'Aspergillus_unidentified',
             'Diaporthe_CCL067',
             'Diaporthe_unidentified',
             'Oculimacula_yallundae-CCL031',
             'Oculimacula_yallundae-CCL029',
             'Dothiorella_vidmadera',
             'Quambalaria_cyanescens',
             'Entoleuca_unidentified',
             'Asteroma_CCL060',
             'Asteroma_CCL068',
             'Saccharomyces_cerevisiae',
             'Cladophialophora_unidentified',
             'Candida_albicans',
             'Candida_metapsilosis',
             'Candida_orthopsilosis',
             'Candida_parapsilosis',
             'Candida_unidentified',
             'Kluyveromyces_marxianus',
             'Pichia_kudriavzevii',
             'Pichia_membranifaciens']

species_delete = [
# 'Candida_orthopsilosis',
#                  'Candida_metapsilosis',
#                  'Aspergillus_niger'
]

mock_community = ['Puccinia_striiformis-tritici',
             'Zymoseptoria_tritici',
             'Pyrenophora_tritici-repentis',
             'Fusarium_oxysporum',
             'Tuber_brumale',
             'Cortinarius_globuliformis',
             'Aspergillus_niger',
             'Clavispora_lusitaniae',
             'Kluyveromyces_unidentified',
             'Penicillium_chrysogenum',
             'Rhodotorula_mucilaginosa',
             'Scedosporium_boydii',
             'Blastobotrys_proliferans',
             'Debaryomyces_unidentified',
             'Galactomyces_geotrichum',
             'Kodamaea_ohmeri',
             'Meyerozyma_guilliermondii',
             'Wickerhamomyces_anomalus',
             'Yamadazyma_mexicana',
             'Yamadazyma_scolyti',
             'Yarrowia_lipolytica',
             'Zygoascus_hellenicus',
             'Aspergillus_flavus',
             'Cryptococcus_zero',
             'Aspergillus_unidentified',
             'Diaporthe_CCL067',
             'Diaporthe_unidentified',
             'Oculimacula_yallundae-CCL031',
             'Oculimacula_yallundae-CCL029',
             'Dothiorella_vidmadera',
             'Quambalaria_cyanescens',
             'Entoleuca_unidentified',
             'Asteroma_CCL060',
             'Asteroma_CCL068',
             'Saccharomyces_cerevisiae',
             'Cladophialophora_unidentified',
             'Candida_albicans',
             'Candida_metapsilosis',
             'Candida_orthopsilosis',
             'Candida_parapsilosis',
             'Candida_unidentified',
             'Kluyveromyces_marxianus',
             'Pichia_kudriavzevii',
             'Pichia_membranifaciens']

In [7]:
ref_df = pd.read_csv(reference_dataframe_fn)
ref_df['name_species'] = ref_df['genus'] +"_"+ ref_df['species']

In [8]:
ref_df.name_species.tolist()

['puccinia_striiformis-tritici',
 'zymoseptoria_tritici',
 'pyrenophora_tritici-repentis',
 'fusarium_oxysporum',
 'tuber_brumale',
 'cortinarius_globuliformis',
 'aspergillus_niger',
 'clavispora_lusitaniae',
 'kluyveromyces_unidentified',
 'penicillium_chrysogenum',
 'rhodotorula_mucilaginosa',
 'scedosporium_boydii',
 'blastobotrys_proliferans',
 'debaryomyces_unidentified',
 'galactomyces_geotrichum',
 'kodamaea_ohmeri',
 'meyerozyma_guilliermondii',
 'wickerhamomyces_anomalus',
 'yamadazyma_mexicana',
 'yamadazyma_scolyti',
 'yarrowia_lipolytica',
 'zygoascus_hellenicus',
 'aspergillus_flavus',
 'cryptococcus_zero',
 'aspergillus_unidentified',
 'diaporthe_ccl067',
 'diaporthe_unidentified',
 'oculimacula_yallundae-ccl031',
 'oculimacula_yallundae-ccl029',
 'dothiorella_vidmadera',
 'quambalaria_cyanescens',
 'entoleuca_unidentified',
 'asteroma_ccl060',
 'asteroma_ccl068',
 'saccharomyces_cerevisiae',
 'cladophialophora_unidentified',
 'candida_albicans',
 'candida_metapsilosis',

In [9]:
new_db_fn = os.path.join(OUT_DIR, 'gsref.db.fasta')

In [10]:
new_db_list = []
old_db_list = []
for seq in SeqIO.parse(max_custom_database_fn, 'fasta'):
    old_db_list.append(seq.id)
    if seq.id.lower() in ref_df.name_species.tolist():
        #print(seq.id)
        seq.id = seq.name = seq.description = seq.id.lower()
        new_db_list.append(seq)
    else:
        print(seq.id)

In [11]:
if len(new_db_list) == len(old_db_list):
    SeqIO.write(new_db_list, new_db_fn, 'fasta')
else:
    print("please check!")

In [12]:
sub_db_fn = os.path.join(OUT_DIR, 'gsref.subdb.fasta')
sub_db_list = []
for seq in new_db_list:
    if seq.id not in [x.lower() for x in species_delete]:
        sub_db_list.append(seq)

In [13]:
if len(sub_db_list) + len(species_delete) == len(new_db_list):
    SeqIO.write(sub_db_list, sub_db_fn, 'fasta' )
else:
    print("please check!")

In [14]:
[x.id for x in sub_db_list]

['puccinia_striiformis-tritici',
 'zymoseptoria_tritici',
 'pyrenophora_tritici-repentis',
 'fusarium_oxysporum',
 'tuber_brumale',
 'cortinarius_globuliformis',
 'aspergillus_niger',
 'clavispora_lusitaniae',
 'kluyveromyces_unidentified',
 'penicillium_chrysogenum',
 'rhodotorula_mucilaginosa',
 'scedosporium_boydii',
 'blastobotrys_proliferans',
 'debaryomyces_unidentified',
 'galactomyces_geotrichum',
 'kodamaea_ohmeri',
 'meyerozyma_guilliermondii',
 'wickerhamomyces_anomalus',
 'yamadazyma_mexicana',
 'yamadazyma_scolyti',
 'yarrowia_lipolytica',
 'zygoascus_hellenicus',
 'aspergillus_flavus',
 'cryptococcus_zero',
 'aspergillus_unidentified',
 'diaporthe_ccl067',
 'diaporthe_unidentified',
 'oculimacula_yallundae-ccl031',
 'oculimacula_yallundae-ccl029',
 'dothiorella_vidmadera',
 'quambalaria_cyanescens',
 'entoleuca_unidentified',
 'asteroma_ccl060',
 'asteroma_ccl068',
 'saccharomyces_cerevisiae',
 'cladophialophora_unidentified',
 'candida_albicans',
 'candida_metapsilosis',

In [15]:
mock_community = [x.lower() for x in mock_community]

In [16]:
mock_community

['puccinia_striiformis-tritici',
 'zymoseptoria_tritici',
 'pyrenophora_tritici-repentis',
 'fusarium_oxysporum',
 'tuber_brumale',
 'cortinarius_globuliformis',
 'aspergillus_niger',
 'clavispora_lusitaniae',
 'kluyveromyces_unidentified',
 'penicillium_chrysogenum',
 'rhodotorula_mucilaginosa',
 'scedosporium_boydii',
 'blastobotrys_proliferans',
 'debaryomyces_unidentified',
 'galactomyces_geotrichum',
 'kodamaea_ohmeri',
 'meyerozyma_guilliermondii',
 'wickerhamomyces_anomalus',
 'yamadazyma_mexicana',
 'yamadazyma_scolyti',
 'yarrowia_lipolytica',
 'zygoascus_hellenicus',
 'aspergillus_flavus',
 'cryptococcus_zero',
 'aspergillus_unidentified',
 'diaporthe_ccl067',
 'diaporthe_unidentified',
 'oculimacula_yallundae-ccl031',
 'oculimacula_yallundae-ccl029',
 'dothiorella_vidmadera',
 'quambalaria_cyanescens',
 'entoleuca_unidentified',
 'asteroma_ccl060',
 'asteroma_ccl068',
 'saccharomyces_cerevisiae',
 'cladophialophora_unidentified',
 'candida_albicans',
 'candida_metapsilosis',

In [17]:
def subsamplereads(in_fn, out_fn, n_reads):
    command = F'reformat.sh samplereadstarget={n_reads} in={in_fn} out={out_fn}'
    out = subprocess.getstatusoutput(command)
    if out[0] == 0:
        print(F":)Completed {command}\n")
    else:
        print(F":(check one {command}!!\n")

In [18]:
n_reads = 15000

In [19]:
MC_READ_DIR = os.path.join(OUT_DIR, 'MC_READS')
if not os.path.exists(MC_READ_DIR):
    os.mkdir(MC_READ_DIR)

In [20]:
ref_df.columns

Index(['Unnamed: 0', 'species', 'genus', 'family', 'order', 'class', 'phylum',
       'kingdom', '# raw reads', '# reads after homology filtering',
       '# reads after length filtering', '# for use', 'path to raw reads',
       'path to homology filtering', 'path to length filtering',
       'path for use', 'name_species'],
      dtype='object')

In [21]:
fn_subsampling = {}
for x in mock_community:
    fn_subsampling[x] = (ref_df[(ref_df['species'] == x.split('_')[1]) & (ref_df['genus'] == x.split('_')[0])]['path for use'].tolist()[0])
    fn_subsampling[x] = os.path.join(INPUT_BASEDIR, fn_subsampling[x])
fn_subsampling

{'puccinia_striiformis-tritici': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode01/length_restricted_for_use.fasta',
 'zymoseptoria_tritici': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode02/length_restricted_for_use.fasta',
 'pyrenophora_tritici-repentis': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode03/length_restricted_for_use.fasta',
 'fusarium_oxysporum': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode04/length_restricted_for_use.fasta',
 'tuber_brumale': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode05/length_restricted_for_use.fasta',
 'cortinarius_globuliformis': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode06/length_restricted_for_use.fasta',
 'aspergillus_niger': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcod

In [22]:
sub_reads_fn = {}
for key, value in fn_subsampling.items():
    species = key
    in_fn = value
    out_fn = os.path.join(MC_READ_DIR, F'{species}.{n_reads}.fasta')
    subsamplereads(in_fn, out_fn, n_reads)
    sub_reads_fn[species] = out_fn
    # Will print 'check one' if files already produced

:(check one reformat.sh samplereadstarget=15000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode01/length_restricted_for_use.fasta out=/media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/puccinia_striiformis-tritici.15000.fasta!!

:(check one reformat.sh samplereadstarget=15000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode02/length_restricted_for_use.fasta out=/media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/zymoseptoria_tritici.15000.fasta!!

:(check one reformat.sh samplereadstarget=15000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode03/length_restricted_for_use.fasta out=/media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/pyrenophora_tritici-repentis.15000.fasta!!

:(check one reformat.sh samplereadstarget=15000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171103_FAH15473/barcode04/len

:(check one reformat.sh samplereadstarget=15000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171212_FAH18688/barcode08/length_restricted_for_use.fasta out=/media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/quambalaria_cyanescens.15000.fasta!!

:(check one reformat.sh samplereadstarget=15000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171212_FAH18688/barcode09/length_restricted_for_use.fasta out=/media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/entoleuca_unidentified.15000.fasta!!

:(check one reformat.sh samplereadstarget=15000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171212_FAH18688/barcode11/length_restricted_for_use.fasta out=/media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/asteroma_ccl060.15000.fasta!!

:(check one reformat.sh samplereadstarget=15000 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/20171212_FAH18688/barcode12/length_restricted_fo

### Map with minimap against both databases

In [23]:
def minimapmapping(fasta_fn, ref_fn, out_fn, threads):
    command = F"minimap2 -x map-ont -t {threads} {ref_fn} {fasta_fn} -o {out_fn}"
    out = subprocess.getstatusoutput(command)
    if out[0] == 0:
        print(F":)Completed {command}\n")
    else:
        print(F":(check one {command}!!\n")

In [24]:
dbases_fn = {}
for x in [sub_db_fn, new_db_fn]:
    dbases_fn[x] = os.path.join(OUT_DIR, os.path.basename(x).replace('.fasta', '').replace('.','_'))
    if not os.path.exists(dbases_fn[x]):
        os.mkdir(dbases_fn[x])
dbases_fn

{'/media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.subdb.fasta': '/media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_subdb',
 '/media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.db.fasta': '/media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_db'}

In [25]:
db_fn = sub_db_fn
sub_db_mapping_fn = {}
for species, fasta_fn in sub_reads_fn.items():
    tmp_out = dbases_fn[db_fn]
    db_name = os.path.basename(db_fn).replace('.fasta', '')
    out_fn = os.path.join(tmp_out, F"{db_name}.{species}.minimap2.paf")
    sub_db_mapping_fn[species] = out_fn
    minimapmapping(fasta_fn, db_fn, out_fn, threads)

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/puccinia_striiformis-tritici.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_subdb/gsref.subdb.puccinia_striiformis-tritici.minimap2.paf

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/zymoseptoria_tritici.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_subdb/gsref.subdb.zymoseptoria_tritici.minimap2.paf

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/pyrenophora_tritici-repentis.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/aspergillus_unidentified.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_subdb/gsref.subdb.aspergillus_unidentified.minimap2.paf

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/diaporthe_ccl067.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_subdb/gsref.subdb.diaporthe_ccl067.minimap2.paf

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/diaporthe_unidentified.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_subdb/gsref.subd

In [26]:
db_fn = new_db_fn
new_db_mapping_fn = {}
for species, fasta_fn in sub_reads_fn.items():
    tmp_out = dbases_fn[db_fn]
    db_name = os.path.basename(db_fn).replace('.fasta', '')
    out_fn = os.path.join(tmp_out, F"{db_name}.{species}.minimap2.paf")
    new_db_mapping_fn[species] = out_fn
    minimapmapping(fasta_fn, db_fn, out_fn, threads)

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/puccinia_striiformis-tritici.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_db/gsref.db.puccinia_striiformis-tritici.minimap2.paf

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/zymoseptoria_tritici.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_db/gsref.db.zymoseptoria_tritici.minimap2.paf

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/pyrenophora_tritici-repentis.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_db/gsref.db.pyr

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/diaporthe_ccl067.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_db/gsref.db.diaporthe_ccl067.minimap2.paf

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/diaporthe_unidentified.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_db/gsref.db.diaporthe_unidentified.minimap2.paf

:)Completed minimap2 -x map-ont -t 6 /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/MC_READS/oculimacula_yallundae-ccl031.15000.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping_mock_gsref/gsref_db/gsref.db.oculimacula_yallundae-c

### Look at mapping results

In [27]:
def mapping_results(fn, species):
    min_header = ['qseqid', 'qlen', 'qstart', 'qstop', 'strand', 'tname', 'tlen', 'tstart', 'tend', 'nmatch', 'alen', 'mquality']
    tmp_df = pd.read_csv(fn, sep='\t', header = None, usecols=[x for x in range(0,12)], names=min_header)
    tmp_df['cscore'] = tmp_df['alen']/(tmp_df['alen']-tmp_df['nmatch'])
    sub_df = tmp_df[tmp_df['cscore'] == tmp_df.groupby('qseqid')['cscore'].transform(max)].reset_index(drop=True)
    hit_series = pd.Series(sub_df.groupby('tname')['mquality'].count().tolist()/sub_df.groupby('tname')['mquality'].count().sum(),
                      sub_df.groupby('tname')['mquality'].count().index)
    hit_series.sort_values(ascending=False, inplace=True)
    print(sub_df.qseqid.unique().shape == tmp_df.qseqid.unique().shape)
    print('##########\n')
    print(F"This was the query species: {species}\n")
    print(F"These are the results:")
    print(hit_series,'\n')
    hit_series.to_json('/media/MassStorage/tmp/TE/honours/analysis/Mapping/custom_results/%s.json' % species)

In [28]:
def pull_mapping_results(fn):
    """
    Takes a minimap2 paf and reads it in with the first 12 columns. Ignores the rest.
    Filters for each read the best hit on mquality first taking the highest value.
    Filters for each read by the number of nmatches in the second step.
    Returns a dataframe that has the tnames as index and the counts of hits as column 'count'.
    The dataframe has also the taxrank columns ['k', 'p', 'c', 'o', 'f', 'g', 's'] that are all False to start with.
    """
    min_header = ['qseqid', 'qlen', 'qstart', 'qstop', 'strand', 'tname', 'tlen', 'tstart', 'tend', 'nmatch', 'alen', 'mquality']
    tmp_df = pd.read_csv(fn, sep='\t', header = None, usecols=[x for x in range(0,12)], names=min_header)
    tmp_df['cscore'] = tmp_df['alen']/(tmp_df['alen']-tmp_df['nmatch'])
    sub_df = tmp_df[tmp_df['cscore'] == tmp_df.groupby('qseqid')['cscore'].transform(max)].reset_index(drop=True)
#     sub_df = sub_df[sub_df['nmatch'] == sub_df.groupby('qseqid')['nmatch'].transform(max)].reset_index(drop=True)
    hit_df = pd.DataFrame(sub_df.groupby('tname')['cscore'].count().tolist(), sub_df.groupby('tname')['cscore'].count().index, columns=['count'])
    hit_df.sort_values(by='count', ascending=False, inplace=True)
    for key in ['k', 'p', 'c', 'o', 'f', 'g', 's']:
        hit_df[key] = False
    return hit_df

In [29]:
def getquery_taxfileid(refdf_fn, species):
    """
    Takes the reference dataframe filename and the species name.
    Returns the taxfileid, which is the date/flowcellid (column 0 value) of the ref_df.
    """
    ref_df = pd.read_csv(refdf_fn)
    ref_df['name_species'] = ref_df['genus'] +"_"+ ref_df['species']
    return ref_df[ref_df.name_species == species].iloc[:,0].values[0]

In [30]:
def get_taxid_dict(taxid_fn, taxfileid):
    """
    Takes a taxonomy assignment file filename in the Qiime format and a taxonomic identifier.
    Returns the a dictionary with the taxonomic assignment at each rank.
    """
    tax_dict = {}
    with open(taxid_fn, 'r') as fh:
        for line in fh:
            if line.startswith(taxfileid):
                taxrankids = line.rstrip().split('\t')[1].split(';')
                for taxrank in taxrankids:
                    tax_dict[taxrank.split('__')[0]] = taxrank.split('__')[1]
    return tax_dict

In [31]:
def assign_taxranks_results(mapping_df, tax_fn, ref_df_fn = False):
    """
    This function assigns the taxonomic ranks for each hit in the mapping results dataframe.
    It takes a mapping_df, taxonomy assignment file, and if required a reference dataframe filename.
    Returns the mapping dataframe with assignment. 
    """
    for tname in mapping_df.index:
        if ref_df_fn:
            tmp_taxfileid = getquery_taxfileid(ref_df_fn, tname)
        else:
            tmp_taxfileid = tname
        tmp_tax_dict = get_taxid_dict(tax_fn, tmp_taxfileid)
        for key, value in tmp_tax_dict.items():
            mapping_df.loc[tname, key] = value
    return mapping_df

In [32]:
def get_accuracy_dict(mapping_df, query_tax_dict):
    """
    Summarieses the mapping accuracy of the mapping results at all taxonomic ranks.
    Takes the mapping_df with taxnomonic assignments and a taxnomic dictionary of the known query.
    Returns an accuracy dictionary for each taxnomic rank ['k', 'p', 'c', 'o', 'f', 'g', 's']. 
    Right now this function takes a qiime tax 
    """
    accuracy_dict = {}
    total_count = mapping_df['count'].sum()
    for tax_rank in ['k', 'p', 'c', 'o', 'f', 'g', 's']:
        hit_count = mapping_df[mapping_df[tax_rank] == query_tax_dict[tax_rank]]['count'].sum()
        accuracy_dict[tax_rank] = hit_count/total_count
    return accuracy_dict

In [33]:
### this is running the reads against the full database
### These results show which species each sample match to and the
### proportion of reads matching to each species
for species, hit_fn in new_db_mapping_fn.items():
    mapping_results(hit_fn, species)

True
##########

This was the query species: puccinia_striiformis-tritici

These are the results:
tname
puccinia_striiformis-tritici    0.994467
pyrenophora_tritici-repentis    0.000533
cortinarius_globuliformis       0.000467
saccharomyces_cerevisiae        0.000400
cryptococcus_zero               0.000400
aspergillus_niger               0.000333
yarrowia_lipolytica             0.000267
blastobotrys_proliferans        0.000267
kodamaea_ohmeri                 0.000267
clavispora_lusitaniae           0.000200
zymoseptoria_tritici            0.000200
tuber_brumale                   0.000200
yamadazyma_mexicana             0.000200
scedosporium_boydii             0.000200
penicillium_chrysogenum         0.000133
oculimacula_yallundae-ccl031    0.000133
kluyveromyces_marxianus         0.000133
candida_metapsilosis            0.000133
fusarium_oxysporum              0.000133
candida_unidentified            0.000133
rhodotorula_mucilaginosa        0.000133
pichia_membranifaciens          0.0

True
##########

This was the query species: clavispora_lusitaniae

These are the results:
tname
clavispora_lusitaniae           0.984887
candida_unidentified            0.002796
kodamaea_ohmeri                 0.001931
candida_orthopsilosis           0.000999
yamadazyma_scolyti              0.000799
candida_albicans                0.000666
candida_metapsilosis            0.000666
kluyveromyces_unidentified      0.000599
saccharomyces_cerevisiae        0.000599
kluyveromyces_marxianus         0.000533
wickerhamomyces_anomalus        0.000466
zymoseptoria_tritici            0.000399
yamadazyma_mexicana             0.000399
debaryomyces_unidentified       0.000333
pichia_kudriavzevii             0.000333
yarrowia_lipolytica             0.000333
penicillium_chrysogenum         0.000266
meyerozyma_guilliermondii       0.000266
entoleuca_unidentified          0.000266
puccinia_striiformis-tritici    0.000200
blastobotrys_proliferans        0.000200
zygoascus_hellenicus            0.000200
p

True
##########

This was the query species: galactomyces_geotrichum

These are the results:
tname
galactomyces_geotrichum          0.984758
candida_metapsilosis             0.000877
pyrenophora_tritici-repentis     0.000809
entoleuca_unidentified           0.000809
diaporthe_ccl067                 0.000809
candida_albicans                 0.000809
yamadazyma_mexicana              0.000674
wickerhamomyces_anomalus         0.000674
kodamaea_ohmeri                  0.000607
debaryomyces_unidentified        0.000540
kluyveromyces_unidentified       0.000540
blastobotrys_proliferans         0.000540
yarrowia_lipolytica              0.000540
zygoascus_hellenicus             0.000472
pichia_kudriavzevii              0.000472
saccharomyces_cerevisiae         0.000472
clavispora_lusitaniae            0.000405
yamadazyma_scolyti               0.000405
candida_parapsilosis             0.000405
dothiorella_vidmadera            0.000337
aspergillus_flavus               0.000337
tuber_brumale      

True
##########

This was the query species: aspergillus_unidentified

These are the results:
tname
aspergillus_unidentified         0.667932
aspergillus_niger                0.319217
aspergillus_flavus               0.006526
penicillium_chrysogenum          0.001065
entoleuca_unidentified           0.000533
oculimacula_yallundae-ccl031     0.000466
zymoseptoria_tritici             0.000400
dothiorella_vidmadera            0.000333
diaporthe_unidentified           0.000333
saccharomyces_cerevisiae         0.000266
tuber_brumale                    0.000266
asteroma_ccl068                  0.000266
cladophialophora_unidentified    0.000266
oculimacula_yallundae-ccl029     0.000266
diaporthe_ccl067                 0.000200
debaryomyces_unidentified        0.000200
kluyveromyces_marxianus          0.000200
puccinia_striiformis-tritici     0.000133
quambalaria_cyanescens           0.000133
fusarium_oxysporum               0.000133
blastobotrys_proliferans         0.000133
asteroma_ccl060   

True
##########

This was the query species: asteroma_ccl068

These are the results:
tname
asteroma_ccl068                  0.556854
diaporthe_ccl067                 0.431663
asteroma_ccl060                  0.001792
aspergillus_flavus               0.001394
pichia_membranifaciens           0.001328
diaporthe_unidentified           0.001062
entoleuca_unidentified           0.001062
oculimacula_yallundae-ccl029     0.000797
oculimacula_yallundae-ccl031     0.000664
scedosporium_boydii              0.000597
quambalaria_cyanescens           0.000465
cryptococcus_zero                0.000398
aspergillus_unidentified         0.000398
dothiorella_vidmadera            0.000332
fusarium_oxysporum               0.000266
aspergillus_niger                0.000266
saccharomyces_cerevisiae         0.000133
galactomyces_geotrichum          0.000133
penicillium_chrysogenum          0.000066
rhodotorula_mucilaginosa         0.000066
candida_unidentified             0.000066
cladophialophora_unidentifi

True
##########

This was the query species: candida_unidentified

These are the results:
tname
candida_unidentified             0.818551
candida_albicans                 0.166079
candida_metapsilosis             0.002861
candida_orthopsilosis            0.002063
meyerozyma_guilliermondii        0.001530
debaryomyces_unidentified        0.001131
candida_parapsilosis             0.000865
yamadazyma_mexicana              0.000798
saccharomyces_cerevisiae         0.000732
kluyveromyces_unidentified       0.000665
oculimacula_yallundae-ccl029     0.000665
entoleuca_unidentified           0.000599
blastobotrys_proliferans         0.000466
kodamaea_ohmeri                  0.000466
yarrowia_lipolytica              0.000333
wickerhamomyces_anomalus         0.000266
yamadazyma_scolyti               0.000266
pichia_membranifaciens           0.000200
tuber_brumale                    0.000200
oculimacula_yallundae-ccl031     0.000133
zygoascus_hellenicus             0.000133
diaporthe_unidentified

In [34]:
### These results show which the proportion of reads that match
### the sample label at each taxonomic rank

for species in mock_community:
    print(species)
    mapping_results = pull_mapping_results(sub_db_mapping_fn[species])
    
    mapping_results = assign_taxranks_results(mapping_results, taxonomy_file_fn, ref_df_fn=reference_dataframe_fn)

    taxfileid = getquery_taxfileid(reference_dataframe_fn, species)

    query_tax_dict = get_taxid_dict(taxonomy_file_fn, taxfileid)

    sensitivity_dict = get_accuracy_dict(mapping_results, query_tax_dict)

    print(json.dumps(sensitivity_dict, indent=1))
    
    with open('/media/MassStorage/tmp/TE/honours/analysis/Mapping/custom_results/tax_rank/%s.json' % species,'w+') as json_file:
        json.dump(sensitivity_dict,json_file)

puccinia_striiformis-tritici
{
 "k": 1.0,
 "p": 0.9955339288094921,
 "c": 0.9944674043460872,
 "o": 0.9944674043460872,
 "f": 0.9944674043460872,
 "g": 0.9944674043460872,
 "s": 0.9944674043460872
}
zymoseptoria_tritici
{
 "k": 1.0,
 "p": 0.9996670218433671,
 "c": 0.9938066062866276,
 "o": 0.9915423548215238,
 "f": 0.9915423548215238,
 "g": 0.9915423548215238,
 "s": 0.9915423548215238
}
pyrenophora_tritici-repentis
{
 "k": 1.0,
 "p": 0.9996680167319567,
 "c": 0.9907708651483965,
 "o": 0.9568421751543722,
 "f": 0.9568421751543722,
 "g": 0.9568421751543722,
 "s": 0.9568421751543722
}
fusarium_oxysporum
{
 "k": 1.0,
 "p": 0.9996061700032819,
 "c": 0.9811617984903184,
 "o": 0.9725631768953069,
 "f": 0.9725631768953069,
 "g": 0.9725631768953069,
 "s": 0.9725631768953069
}
tuber_brumale
{
 "k": 1.0,
 "p": 0.9994016753091344,
 "c": 0.9900279218189071,
 "o": 0.9900279218189071,
 "f": 0.9900279218189071,
 "g": 0.9900279218189071,
 "s": 0.9900279218189071
}
cortinarius_globuliformis
{
 "k": 1.0,

{
 "k": 1.0,
 "p": 0.9994680143636122,
 "c": 0.9963425987498338,
 "o": 0.9963425987498338,
 "f": 0.9908897459768586,
 "g": 0.9908897459768586,
 "s": 0.9872988429312408
}


### Notebook to analyzse the efficiency of minimap2 mapping for Oxford Nanopore MinION reads against Qiime2 Database

In [35]:
os.path.abspath(os.curdir)

'/media/MassStorage/tmp/TE/honours/scripts/Notebooks'

In [36]:
qiime_db_fn = os.path.abspath('../../analysis/qiime2/db/sh_refs_qiime_ver8_dynamic_02.02.2019.fasta')
qiime_tax_fn = os.path.abspath('../../analysis/qiime2/db/sh_taxonomy_qiime_ver8_dynamic_02.02.2019.txt')
threads = 10
QIIME_DIR = os.path.abspath('../../analysis/qiime2/')

In [37]:
##mapping folder
mapping_dir = os.path.join(QIIME_DIR, os.path.basename(qiime_db_fn).replace('.fasta', '').replace('.','_'))
if not os.path.exists(mapping_dir):
    os.mkdir(mapping_dir)
subsampling_dir = os.path.join(QIIME_DIR, 'subsamplereads')
if not os.path.exists(subsampling_dir):
    os.mkdir(subsampling_dir)

In [38]:
import json
from collections import OrderedDict

def get_accuracy_dict(mapping_df, query_tax_dict):
    """
    Summarises the mapping accuracy of the mapping results at all taxonomic ranks.
    Takes the mapping_df with taxnomonic assignments and a taxnomic dictionary of the known query.
    Returns an accuracy dictionary for each taxnomic rank ['k', 'p', 'c', 'o', 'f', 'g', 's']. 
    Right now this function takes a qiime tax 
    """
    accuracy_dict = OrderedDict()
    total_count = mapping_df['count'].sum()
    for tax_rank in ['k', 'p', 'c', 'o', 'f', 'g', 's']:
        tmps_df = pd.DataFrame(data=None)
        if tax_rank == 's':
            for index, row in mapping_df[mapping_df[tax_rank] == query_tax_dict[tax_rank]].iterrows():
                if row['s'] == 'unidentified' and row['g'] != query_tax_dict['g']:
                    mapping_df.drop(index, axis=0, inplace=True)
                else:
                    continue
            hit_count = mapping_df[mapping_df[tax_rank] == query_tax_dict[tax_rank]]['count'].sum()
        else:
            hit_count = mapping_df[mapping_df[tax_rank] == query_tax_dict[tax_rank]]['count'].sum()
        accuracy_dict[tax_rank] = hit_count/total_count
    return accuracy_dict

def minimapmapping(fasta_fn, ref_fn, out_fn, threads):
    command = F"minimap2 -x map-ont -t {threads} {ref_fn} {fasta_fn} -o {out_fn}"
    out = subprocess.getstatusoutput(command)

def pull_mapping_results_v3(fn):
    """
    Takes a minimap2 paf and reads it in with the first 12 columns. Ignores the rest.
    Filters for each read the best hit on mquality first taking the highest value.
    Filters for each read by the number of nmatches in the second step.
    Returns a dataframe that has the tnames as index and the counts of hits as column 'count'.
    The dataframe has also the taxrank columns ['k', 'p', 'c', 'o', 'f', 'g', 's'] that are all False to start with.
    """
    min_header = ['qseqid', 'qlen', 'qstart', 'qstop', 'strand', 'tname', 'tlen', 'tstart', 'tend', 'nmatch', 'alen', 'mquality']
    tmp_df = pd.read_csv(fn, sep='\t', header = None, usecols=[x for x in range(0,12)], names=min_header)
    tmp_df['cscore'] = tmp_df['alen']/(tmp_df['alen']-tmp_df['nmatch'])
    sub_df = tmp_df[tmp_df['cscore'] == tmp_df.groupby('qseqid')['cscore'].transform(max)].reset_index(drop=True)
#     sub_df = sub_df[sub_df['nmatch'] == sub_df.groupby('qseqid')['nmatch'].transform(max)].reset_index(drop=True)
    hit_df = pd.DataFrame(sub_df.groupby('tname')['cscore'].count().tolist(), sub_df.groupby('tname')['cscore'].count().index, columns=['count'])
    hit_df.sort_values(by='count', ascending=False, inplace=True)
    for key in ['k', 'p', 'c', 'o', 'f', 'g', 's']:
        hit_df[key] = False
        tmp_df[key] = False
    return hit_df, tmp_df
    
def subsamplereads(in_fn, out_fn, n_reads):
    command = F'reformat.sh samplereadstarget={n_reads} in={in_fn} out={out_fn}'
    out = subprocess.getstatusoutput(command)

test_species_list = []
for entry in ref_df.name_species.tolist():
#     if entry[-7:] != '-ccl031' and entry[-7:] != '-ccl029':
#         test_species_list.append(entry)
#     else:
#         test_species_list.append(entry[:-7])
#         print(entry[:-7])
    test_species_list.append(entry)
    
for test_species in test_species_list:
    
    print(test_species)
    
    #subsample tests species
    fn_subsampling = {}
    test_species = [test_species]
    for x in test_species:
        fn_subsampling[x] = (ref_df[(ref_df['species'] == x.split('_')[1]) & (ref_df['genus'] == x.split('_')[0])]['path for use'].tolist()[0])
        fn_subsampling[x] = os.path.join(INPUT_BASEDIR, fn_subsampling[x])

    sub_reads_fn = {}
    n_reads = 15000
    for key, value in fn_subsampling.items():
        species = key
        in_fn = value
        out_fn = os.path.join(subsampling_dir, F'{species}.{n_reads}.fasta')
        subsamplereads(in_fn, out_fn, n_reads)
        sub_reads_fn[species] = out_fn
        
    ###Map the reads
    db_fn = qiime_db_fn
    sub_db_mapping_fn = {}
    for species, fasta_fn in sub_reads_fn.items():
        db_name = os.path.basename(db_fn).replace('.fasta', '')
        out_fn = os.path.join(mapping_dir, F"{db_name}.{species}.minimap2.paf")
        sub_db_mapping_fn[species] = out_fn
        minimapmapping(fasta_fn, db_fn, out_fn, threads)
        
    ###Test out the summary results statistic for a single mapping result
    species = test_species[0]
    mapping_results , full_results_df = pull_mapping_results_v3(sub_db_mapping_fn[species])
    mapping_results = assign_taxranks_results(mapping_results, qiime_tax_fn)
    taxfileid = getquery_taxfileid(reference_dataframe_fn, species)
    
    query_tax_dict = get_taxid_dict(taxonomy_file_fn, taxfileid)

    print(query_tax_dict)
    
    sensitivity_dict = get_accuracy_dict(mapping_results, query_tax_dict)
        
    print(json.dumps(sensitivity_dict, indent=1))
    
    with open('/media/MassStorage/tmp/TE/honours/analysis/Mapping/qiime_results/tax_rank/%s.json' % species,'w+') as json_file:
        json.dump(sensitivity_dict,json_file)

puccinia_striiformis-tritici
{'k': 'Fungi', 'p': 'Basidiomycota', 'c': 'Pucciniomycetes', 'o': 'Pucciniales', 'f': 'Pucciniaceae', 'g': 'Puccinia', 's': 'Puccinia_striiformis'}
{
 "k": 1.0,
 "p": 0.9562130177514793,
 "c": 0.8740959894806049,
 "o": 0.8740959894806049,
 "f": 0.8723208415516108,
 "g": 0.8619986850756082,
 "s": 0.801051939513478
}
zymoseptoria_tritici
{'k': 'Fungi', 'p': 'Ascomycota', 'c': 'Dothideomycetes', 'o': 'Capnodiales', 'f': 'Mycosphaerellaceae', 'g': 'Zymoseptoria', 's': 'Zymoseptoria_tritici'}
{
 "k": 1.0,
 "p": 0.8341285085816965,
 "c": 0.35993555982402875,
 "o": 0.18811574446991758,
 "f": 0.1839023483487205,
 "g": 0.16159613358944172,
 "s": 0.0
}
pyrenophora_tritici-repentis
{'k': 'Fungi', 'p': 'Ascomycota', 'c': 'Dothideomycetes', 'o': 'Pleosporales', 'f': 'Pleosporaceae', 'g': 'Pyrenophora', 's': 'Pyrenophora_tritici-repentis'}
{
 "k": 1.0,
 "p": 0.844637829536729,
 "c": 0.6033401586895316,
 "o": 0.5959175838239058,
 "f": 0.35865113898131556,
 "g": 0.30451753

{'k': 'Fungi', 'p': 'Ascomycota', 'c': 'Eurotiomycetes', 'o': 'Eurotiales', 'f': 'Aspergillaceae', 'g': 'Aspergillus', 's': 'unidentified'}
{
 "k": 1.0,
 "p": 0.9483128364479846,
 "c": 0.6535078121580813,
 "o": 0.6524574379622741,
 "f": 0.6523699067792901,
 "g": 0.5191036806862445,
 "s": 0.03221147533808919
}
diaporthe_ccl067
{'k': 'Fungi', 'p': 'Ascomycota', 'c': 'Sordariomycetes', 'o': 'Diaporthales', 'f': 'Diaporthaceae', 'g': 'Diaporthe', 's': 'Diaporthe_CCL067'}
{
 "k": 1.0,
 "p": 0.9729810508697472,
 "c": 0.7845657241915237,
 "o": 0.47934724131747264,
 "f": 0.004722338454181361,
 "g": 0.004722338454181361,
 "s": 0.0
}
diaporthe_unidentified
{'k': 'Fungi', 'p': 'Ascomycota', 'c': 'Sordariomycetes', 'o': 'Diaporthales', 'f': 'Diaporthaceae', 'g': 'Diaporthe', 's': 'unidentified'}
{
 "k": 1.0,
 "p": 0.9779748794571106,
 "c": 0.7123638311804275,
 "o": 0.311387582594202,
 "f": 0.27424251443538306,
 "g": 0.2722781117923686,
 "s": 0.024822906125364606
}
oculimacula_yallundae-ccl031
{'k'

In [116]:
from Bio import SeqIO
import os
import random
import subprocess
import pandas as pd

INPUT_BASEDIR = os.path.abspath('/media/MassStorage/tmp/TE/honours')
subsampling_dir = os.path.abspath('/media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/subsample_reads')
mapping_dir = os.path.abspath('/media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/qiime_results')

wheat_reference_dataframe_fn = os.path.abspath('/media/MassStorage/tmp/TE/honours/analysis/Stats/wheat_reference_dataframe_v3.csv')
wheat_max_custom_database_fn = os.path.abspath('/media/MassStorage/tmp/TE/honours/database/wheat_database_labelled.fasta')
wheat_taxonomy_file_fn = os.path.abspath('/media/MassStorage/tmp/TE/honours/analysis/Stats/wheat_taxonomy_file_qiime.csv')

qiime_db_fn = os.path.abspath('../../analysis/qiime2/db/sh_refs_qiime_ver8_dynamic_02.02.2019.fasta')
qiime_tax_fn = os.path.abspath('../../analysis/qiime2/db/sh_taxonomy_qiime_ver8_dynamic_02.02.2019.txt')
threads = 10

wheat_ref_df = pd.read_csv(wheat_reference_dataframe_fn)
wheat_ref_df

Unnamed: 0.1,Unnamed: 0,barcode,# raw reads,# reads after homology filtering,# reads after length filtering,# for use,path to raw reads,path to homology filtering,path to length filtering,path for use
0,barcode01,barcode01,115052,23477,21590,21590.0,analysis/Concatenated/wheat/barcode01/merged.f...,analysis/Python_Processing/wheat/barcode01/com...,analysis/Length_Filtered/wheat/barcode01/lengt...,analysis/Length_Filtered/wheat/barcode01/lengt...
1,barcode02,barcode02,151005,20416,18100,18100.0,analysis/Concatenated/wheat/barcode02/merged.f...,analysis/Python_Processing/wheat/barcode02/com...,analysis/Length_Filtered/wheat/barcode02/lengt...,analysis/Length_Filtered/wheat/barcode02/lengt...
2,barcode03,barcode03,92927,23616,21919,21919.0,analysis/Concatenated/wheat/barcode03/merged.f...,analysis/Python_Processing/wheat/barcode03/com...,analysis/Length_Filtered/wheat/barcode03/lengt...,analysis/Length_Filtered/wheat/barcode03/lengt...
3,barcode04,barcode04,115407,22716,20568,20568.0,analysis/Concatenated/wheat/barcode04/merged.f...,analysis/Python_Processing/wheat/barcode04/com...,analysis/Length_Filtered/wheat/barcode04/lengt...,analysis/Length_Filtered/wheat/barcode04/lengt...
4,barcode05,barcode05,134094,24785,22433,22433.0,analysis/Concatenated/wheat/barcode05/merged.f...,analysis/Python_Processing/wheat/barcode05/com...,analysis/Length_Filtered/wheat/barcode05/lengt...,analysis/Length_Filtered/wheat/barcode05/lengt...
5,barcode06,barcode06,150739,25462,23769,23769.0,analysis/Concatenated/wheat/barcode06/merged.f...,analysis/Python_Processing/wheat/barcode06/com...,analysis/Length_Filtered/wheat/barcode06/lengt...,analysis/Length_Filtered/wheat/barcode06/lengt...
6,barcode07,barcode07,109201,24688,22931,22931.0,analysis/Concatenated/wheat/barcode07/merged.f...,analysis/Python_Processing/wheat/barcode07/com...,analysis/Length_Filtered/wheat/barcode07/lengt...,analysis/Length_Filtered/wheat/barcode07/lengt...
7,barcode08,barcode08,114215,21725,20105,20105.0,analysis/Concatenated/wheat/barcode08/merged.f...,analysis/Python_Processing/wheat/barcode08/com...,analysis/Length_Filtered/wheat/barcode08/lengt...,analysis/Length_Filtered/wheat/barcode08/lengt...
8,barcode09,barcode09,106976,22012,20451,20451.0,analysis/Concatenated/wheat/barcode09/merged.f...,analysis/Python_Processing/wheat/barcode09/com...,analysis/Length_Filtered/wheat/barcode09/lengt...,analysis/Length_Filtered/wheat/barcode09/lengt...
9,barcode10,barcode10,108583,21514,19810,19810.0,analysis/Concatenated/wheat/barcode10/merged.f...,analysis/Python_Processing/wheat/barcode10/com...,analysis/Length_Filtered/wheat/barcode10/lengt...,analysis/Length_Filtered/wheat/barcode10/lengt...


In [122]:
mock_community = ['barcode01','barcode02',
                  'barcode03','barcode04',
                 'barcode05','barcode06',
                  'barcode07','barcode08',
                  'barcode09','barcode10',
                  'barcode11','barcode12',
                 ]

In [123]:
def subsamplereads(in_fn, out_fn, n_reads):
    command = F'reformat.sh samplereadstarget={n_reads} in={in_fn} out={out_fn}'
    out = subprocess.getstatusoutput(command)
    if out[0] == 0:
        print(F":)Completed {command}\n")
    else:
        print(F":(check one {command}!!\n")

In [124]:
OUT_DIR = os.path.abspath('../../analysis/Mapping/wheat')
if not os.path.exists(OUT_DIR):
    os.mkdir(OUT_DIR)
MC_READ_DIR = os.path.join(OUT_DIR, 'MC_READS')
if not os.path.exists(MC_READ_DIR):
    os.mkdir(MC_READ_DIR)
sub_db_fn = os.path.join('../../analysis/Mapping/gsref.subdb.fasta')
new_db_fn = os.path.join('../../analysis/Mapping/gsref.db.fasta')

In [125]:
wheat_ref_df.columns

Index(['Unnamed: 0', 'barcode', '# raw reads',
       '# reads after homology filtering', '# reads after length filtering',
       '# for use', 'path to raw reads', 'path to homology filtering',
       'path to length filtering', 'path for use'],
      dtype='object')

In [126]:
fn_subsampling = {}
for x in mock_community:
    print(x)
    fn_subsampling[x] = wheat_ref_df[wheat_ref_df['Unnamed: 0'] == x]['path for use'].tolist()[0]
    fn_subsampling[x] = os.path.join(INPUT_BASEDIR, fn_subsampling[x])
fn_subsampling

barcode01
barcode02
barcode03
barcode04
barcode05
barcode06
barcode07
barcode08
barcode09
barcode10
barcode11
barcode12


{'barcode01': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode01/length_restricted_reads.fasta',
 'barcode02': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode02/length_restricted_reads.fasta',
 'barcode03': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode03/length_restricted_reads.fasta',
 'barcode04': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode04/length_restricted_reads.fasta',
 'barcode05': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode05/length_restricted_reads.fasta',
 'barcode06': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode06/length_restricted_reads.fasta',
 'barcode07': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode07/length_restricted_reads.fasta',
 'barcode08': '/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode08/length_restricted_reads.fasta',
 'barcode09': '/media/Ma

In [127]:
sub_reads_fn = {}
for key, value in fn_subsampling.items():
    print(key)
    print(value)
    species = key
    in_fn = value
    out_fn = os.path.join(MC_READ_DIR, F'{species}.{n_reads}.fasta')
    subsamplereads(in_fn, out_fn, n_reads)
    sub_reads_fn[species] = out_fn

barcode01
/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode01/length_restricted_reads.fasta
:(check one reformat.sh samplereadstarget=200 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode01/length_restricted_reads.fasta out=/media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/MC_READS/barcode01.200.fasta!!

barcode02
/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode02/length_restricted_reads.fasta
:(check one reformat.sh samplereadstarget=200 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode02/length_restricted_reads.fasta out=/media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/MC_READS/barcode02.200.fasta!!

barcode03
/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode03/length_restricted_reads.fasta
:(check one reformat.sh samplereadstarget=200 in=/media/MassStorage/tmp/TE/honours/analysis/Length_Filtered/wheat/barcode03/length_restricted_reads.fasta out=/me

In [128]:
def minimapmapping(fasta_fn, ref_fn, out_fn, threads):
    command = F"minimap2 -x map-ont -t {threads} {ref_fn} {fasta_fn} -o {out_fn}"
    out = subprocess.getstatusoutput(command)
    if out[0] == 0:
        print(F":)Completed {command}\n")
    else:
        print(F":(check one {command}!!\n")

In [129]:
dbases_fn = {}
for x in [sub_db_fn, new_db_fn]:
    print(x)
    dbases_fn[x] = os.path.join(OUT_DIR, os.path.basename(x).replace('.fasta', '').replace('.','_'))
    if not os.path.exists(dbases_fn[x]):
        os.mkdir(dbases_fn[x])
dbases_fn

../../analysis/Mapping/gsref.subdb.fasta
../../analysis/Mapping/gsref.db.fasta


{'../../analysis/Mapping/gsref.subdb.fasta': '/media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/gsref_subdb',
 '../../analysis/Mapping/gsref.db.fasta': '/media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/gsref_db'}

In [130]:
db_fn = sub_db_fn
sub_db_mapping_fn = {}
for species, fasta_fn in sub_reads_fn.items():
    tmp_out = dbases_fn[db_fn]
    db_name = os.path.basename(db_fn).replace('.fasta', '')
    out_fn = os.path.join(tmp_out, F"{db_name}.{species}.minimap2.paf")
    sub_db_mapping_fn[species] = out_fn
    minimapmapping(fasta_fn, db_fn, out_fn, threads)

:)Completed minimap2 -x map-ont -t 10 ../../analysis/Mapping/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/MC_READS/barcode01.200.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/gsref_subdb/gsref.subdb.barcode01.minimap2.paf

:)Completed minimap2 -x map-ont -t 10 ../../analysis/Mapping/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/MC_READS/barcode02.200.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/gsref_subdb/gsref.subdb.barcode02.minimap2.paf

:)Completed minimap2 -x map-ont -t 10 ../../analysis/Mapping/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/MC_READS/barcode03.200.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/gsref_subdb/gsref.subdb.barcode03.minimap2.paf

:)Completed minimap2 -x map-ont -t 10 ../../analysis/Mapping/gsref.subdb.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/MC_READS/barcode04.200.fasta -o /media/MassSt

In [131]:
db_fn = new_db_fn
new_db_mapping_fn = {}
for species, fasta_fn in sub_reads_fn.items():
    tmp_out = dbases_fn[db_fn]
    db_name = os.path.basename(db_fn).replace('.fasta', '')
    out_fn = os.path.join(tmp_out, F"{db_name}.{species}.minimap2.paf")
    new_db_mapping_fn[species] = out_fn
    minimapmapping(fasta_fn, db_fn, out_fn, threads)

:)Completed minimap2 -x map-ont -t 10 ../../analysis/Mapping/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/MC_READS/barcode01.200.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/gsref_db/gsref.db.barcode01.minimap2.paf

:)Completed minimap2 -x map-ont -t 10 ../../analysis/Mapping/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/MC_READS/barcode02.200.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/gsref_db/gsref.db.barcode02.minimap2.paf

:)Completed minimap2 -x map-ont -t 10 ../../analysis/Mapping/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/MC_READS/barcode03.200.fasta -o /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/gsref_db/gsref.db.barcode03.minimap2.paf

:)Completed minimap2 -x map-ont -t 10 ../../analysis/Mapping/gsref.db.fasta /media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/MC_READS/barcode04.200.fasta -o /media/MassStorage/tmp/TE/honours/analysis/

In [132]:
def mapping_results_v2(fn, species,expected_species, tax_rank):
    import numpy as np
    min_header = ['qseqid', 'qlen', 'qstart', 'qstop', 'strand', 'tname', 'tlen', 'tstart', 'tend', 'nmatch', 'alen', 'mquality']
    tmp_df = pd.read_csv(fn, sep='\t', header = None, usecols=[x for x in range(0,12)], names=min_header)
    tmp_df['cscore'] = tmp_df['alen']/(tmp_df['alen']-tmp_df['nmatch'])
    sub_df = tmp_df[tmp_df['cscore'] == tmp_df.groupby('qseqid')['cscore'].transform(max)].reset_index(drop=True)
    hit_series = pd.Series(sub_df.groupby('tname')['mquality'].count().tolist()/sub_df.groupby('tname')['mquality'].count().sum(),
                      sub_df.groupby('tname')['mquality'].count().index)
    if tax_rank == 's' or tax_rank == 'species':
        new_new_hit_series = hit_series
        new_new_hit_series.sort_values(ascending=False, inplace=True)
    else:
        new_hit_series = pd.Series()
        new_new_hit_series = pd.Series()
        for index in hit_series.index.unique():
            new_hit_series.at[index] = data=np.sum(hit_series[index])
        for index in new_hit_series.index.unique():
            new_new_hit_series.at[index] = data=np.sum(new_hit_series[index])
        new_new_hit_series.sort_values(ascending=False, inplace=True)
    print(sub_df.qseqid.unique().shape == tmp_df.qseqid.unique().shape)
    print('##########\n')
    print(F"This was the sample type expected:", barcode,"\n")
    print(F"These are the results:")
    print(new_new_hit_series,'\n')
    new_new_hit_series.to_json('/media/MassStorage/tmp/TE/honours/analysis/Mapping/wheat/custom_results/%s_%s.json' % (tax_rank,expected_species[species]))

In [133]:
expected_rank_s = {}
expected_rank_g = {}
expected_rank_f = {}
expected_rank_o = {}
expected_rank_c = {}
expected_rank_p = {}

for column, row in wheat_ref_df.iterrows():
    expected_rank_s[row['Unnamed: 0']] = row['barcode']

In [134]:
###this is running the reads against the full database
for barcode, hit_fn in new_db_mapping_fn.items():
    mapping_results_v2(hit_fn, barcode, expected_rank_s, 's')

True
##########

This was the sample type expected: barcode01 

These are the results:
tname
cryptococcus_zero                0.582090
blastobotrys_proliferans         0.208955
cortinarius_globuliformis        0.029851
saccharomyces_cerevisiae         0.024876
aspergillus_unidentified         0.014925
cladophialophora_unidentified    0.014925
entoleuca_unidentified           0.009950
dothiorella_vidmadera            0.009950
candida_orthopsilosis            0.009950
candida_unidentified             0.009950
debaryomyces_unidentified        0.009950
aspergillus_flavus               0.009950
quambalaria_cyanescens           0.009950
yamadazyma_mexicana              0.009950
yarrowia_lipolytica              0.004975
meyerozyma_guilliermondii        0.004975
oculimacula_yallundae-ccl029     0.004975
penicillium_chrysogenum          0.004975
pichia_membranifaciens           0.004975
rhodotorula_mucilaginosa         0.004975
candida_metapsilosis             0.004975
tuber_brumale            

True
##########

This was the sample type expected: barcode12 

These are the results:
tname
cryptococcus_zero            0.666667
cortinarius_globuliformis    0.333333
dtype: float64 

