# Fig 3. Comparison between ARG profiling methods

#### Dependencies:
    - flye==2.9.3
    - semibin==2.1.0
    - kraken2==2.1.3
    - gtdbtk==2.4.0
    - argo==0.1.0
    - minimap2
    - samtools
    - diamond
    - melon (provides `melon.utils`)
    - pandas
    - scipy
    - tqdm
    - taxonkit

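#### Inputs:
    - fig3/data/*_*x.fa (downsampled 1–32x HQ and LQ)
    - sarg/pathogen.sarg.tsv (reference ARG counts per species, used as ground truth)
    - db/argo, db/kraken-pluspf, db/gtdbtk/release220 (reference databases)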

In [1]:
# %%bash
# mkdir -p fig3/tmp
# for file in fig3/data/*.fa
# do
#     filename=${file##*/}
#     filename=${filename%%.*}
    
#     ## metaflye
#     flye --nano-raw $file \
#         --out-dir fig3/tmp/$filename \
#         --meta \
#         --threads 32
    
#     ## semibin
#     minimap2 -ax map-ont -t 32 fig3/tmp/$filename/assembly.fasta $file | samtools sort > fig3/tmp/$filename/assembly.bam
#     SemiBin2 single_easy_bin \
#         --sequencing-type=long_read \
#         --environment global \
#         -i fig3/tmp/$filename/assembly.fasta \
#         -o fig3/tmp/$filename \
#         -t 32 \
#         -b fig3/tmp/$filename/assembly.bam \
#         --engine cpu

#     ## get ARG types/subtypes of contigs
#     diamond blastx \
#         -q fig3/tmp/$filename/assembly.fasta --threads 32 \
#         -d db/argo/sarg.dmnd -o fig3/tmp/$filename/sarg.tsv \
#         --id 90 --subject-cover 90 -e 1e-15 \
#         -F15 --range-culling --range-cover 25 \
#         --max-hsps 0 --max-target-seqs 25 \
#         --outfmt 6 qseqid sseqid pident length qlen qstart qend slen sstart send evalue bitscore --quiet

#     ## kraken
#     kraken2 fig3/tmp/$filename/assembly.fasta \
#         --db db/kraken-pluspf \
#         --threads 32 \
#         --output fig3/tmp/$filename/assembly.output --use-names \
#         --report fig3/tmp/$filename/assembly.report

#     ## gtdbtk
#     gtdbtk classify_wf \
#         --genome_dir fig3/tmp/$filename/output_bins/ \
#         --cpus 32 \
#         --out_dir fig3/tmp/$filename/gtdbtk \
#         --extension fa.gz \
#         --mash_db db/gtdbtk/release220/mash/

#     ## argo
#     argo $file -d db/argo -o fig3/tmp/$filename -t 32 --plasmid
# done

In [2]:
import pandas as pd
import subprocess
import glob

from melon.utils import *
from collections import defaultdict
from scipy.spatial.distance import cityblock, euclidean
from scipy.stats import pearsonr, spearmanr
from tqdm import tqdm

def parse_diamond(file):
    """Parse a DIAMOND tabular file and keep one hit per query region.

    The file is expected to use the 12-column layout requested by the
    ``diamond blastx`` call in this notebook:
    ``qseqid sseqid pident length qlen qstart qend slen sstart send evalue
    bitscore``. Only ``qseqid``, ``sseqid``, ``pident``, ``qstart`` and
    ``qend`` are used.

    Hits are processed in file order. A hit is kept only when its overlap
    with every previously kept hit on the same query is below 0.25, so
    earlier lines take precedence over later ones.

    Parameters
    ----------
    file : str
        Path to the DIAMOND output.

    Returns
    -------
    list of list
        ``[qseqid, sseqid, qstart, qend, pident]`` per retained hit, with
        ``qstart``/``qend`` as returned by ``sort_coordinate``.
    """
    qcoords = defaultdict(set)
    hits = []
    with open(file) as f:
        for line in f:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            if not line.strip():
                continue

            ls = line.rstrip().split('\t')
            qseqid, sseqid = ls[0], ls[1]

            # sort_coordinate / compute_overlap come from melon.utils;
            # presumably the former orders a coordinate pair and the latter
            # returns an overlap fraction -- TODO confirm against melon.
            qstart, qend = sort_coordinate(int(ls[5]), int(ls[6]))

            # qcoords is a defaultdict, so an unseen query yields an empty set
            # and all() over it is True: the first hit is always accepted.
            if all(
                compute_overlap((qstart, qend, *qcoord), max) < 0.25
                for qcoord in qcoords[qseqid]
            ):
                qcoords[qseqid].add((qstart, qend))

                ## append qseqid and coordinates for back-tracing
                hits.append([qseqid, sseqid, qstart, qend, float(ls[2])])
    return hits

def get_taxonomy(taxid):
    """Resolve taxids to a seven-rank lineage string with ``taxonkit reformat``.

    Parameters
    ----------
    taxid : iterable of str
        Taxonomic identifiers, one per element; they are written to
        taxonkit's standard input one per line.

    Returns
    -------
    dict
        Maps each taxid (as ``int``) to a ``';'``-joined string with one
        ``'<column i+7>|<column i>'`` entry per rank. An empty input gives
        an empty dict without invoking taxonkit.

    Raises
    ------
    subprocess.CalledProcessError
        If taxonkit exits with a non-zero status (``check=True``).
    """
    taxid = list(taxid)
    if not taxid:
        # nothing to resolve; avoids spawning taxonkit and parsing empty output
        return {}

    output = subprocess.run([
        'taxonkit', 'reformat',
        '--taxid-field', '1',
        '--show-lineage-taxids',
        '--fill-miss-rank',
        '--miss-taxid-repl', '0',
        '--miss-rank-repl', 'unclassified',
        '--trim',
        '-f', '{k}\t{p}\t{c}\t{o}\t{f}\t{g}\t{s}'],
        input='\n'.join(taxid)+'\n', text=True, capture_output=True, check=True)

    taxonomy = {}
    for line in output.stdout.rstrip().split('\n'):
        ls = line.rstrip().split('\t')
        if not ls[0]:
            # blank line in the output: no taxid to key on
            continue

        # assumes column 0 is the query taxid, columns 1-7 the rank names and
        # the following columns their lineage taxids -- TODO confirm against
        # the taxonkit version in use.
        taxonomy[int(ls[0])] = ';'.join([ls[i+7] + '|' + ls[i] for i in range(1, len(ls)-7)])

    return taxonomy

def pr(df):
    """Compute presence/absence precision and recall over classified rows.

    Rows whose ``species`` is ``'unclassified'`` are ignored. A row counts as
    predicted when ``est`` is non-zero and as actual when ``cnt`` is non-zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``species``, ``est`` and ``cnt``.

    Returns
    -------
    tuple of float
        ``(precision, recall)``. A metric whose denominator is zero (nothing
        predicted, or nothing actually present) is returned as NaN instead of
        raising ``ZeroDivisionError``.
    """
    df = df[df.species != 'unclassified']
    predicted = df['est'] != 0
    actual = df['cnt'] != 0

    true_positive = int((predicted & actual).sum())
    n_predicted = int(predicted.sum())
    n_actual = int(actual.sum())

    undefined = float('nan')
    precision = true_positive / n_predicted if n_predicted else undefined
    recall = true_positive / n_actual if n_actual else undefined
    return precision, recall

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def parse_kraken(filename):
    """Count ARG hits per species for the assembly-based (Kraken 2) approach.

    Reads ``fig3/tmp/{filename}/assembly.output`` to assign a species to each
    contig, then labels the DIAMOND hits in ``fig3/tmp/{filename}/sarg.tsv``
    with the species of the contig they lie on.

    Parameters
    ----------
    filename : str
        Sample directory name under ``fig3/tmp``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(by_type, by_subtype)``; each holds the grouping columns plus
        ``est``, the number of hits in that group.
    """
    ctg = []
    with open(f'fig3/tmp/{filename}/assembly.output') as f:
        for line in f:
            fields = line.split('\t')
            # kraken2 was run with --use-names, so the third column reads
            # "<name> (taxid <id>)"; take the id from the last such marker.
            taxid = fields[2].split('(taxid ')[-1].split(')')[0]
            ctg.append([fields[1], int(taxid)])

    ctg = pd.DataFrame(ctg, columns=['contig', 'taxid'])
    ctg['taxonomy'] = ctg['taxid'].map(get_taxonomy(ctg['taxid'].astype(str).unique()))
    # the species name is whatever follows the last '|' of the lineage string
    ctg['species'] = ctg.taxonomy.str.split('|').str.get(-1)

    c2kraken = ctg.set_index('contig').species.to_dict()

    sarg = pd.DataFrame(
        parse_diamond(f'fig3/tmp/{filename}/sarg.tsv'),
        columns=['contig', 'sarg', 'qstart', 'qend', 'pident'])
    # subject ids are '|'-separated; field 1 (before any '@') is used as the
    # type and field 2 as the subtype
    sarg['type'] = sarg['sarg'].str.split('|').str.get(1).str.split('@').str.get(0)
    sarg['subtype'] = sarg['sarg'].str.split('|').str.get(2)
    sarg['species'] = sarg.contig.map(c2kraken)

    return (
        sarg.groupby(['species', 'type'], as_index=False).size().rename(columns={'size': 'est'}),
        sarg.groupby(['species', 'subtype'], as_index=False).size().rename(columns={'size': 'est'})
    )

def parse_gtdb(filename):
    """Count ARG hits per species for the binning-based (GTDB-Tk) approach.

    Contigs are mapped to bins via ``fig3/tmp/{filename}/contig_bins.tsv`` and
    bins to species via ``gtdbtk/gtdbtk.bac120.summary.tsv``. DIAMOND hits
    from ``sarg.tsv`` then inherit the species of their contig; hits on
    contigs without a species become ``'unclassified'``.

    Parameters
    ----------
    filename : str
        Sample directory name under ``fig3/tmp``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(by_type, by_subtype)``; each holds the grouping columns plus
        ``est``, the number of hits in that group.
    """
    # imported here because the notebook's import cell does not import `re`
    import re

    c2b = pd.read_table(f'fig3/tmp/{filename}/contig_bins.tsv').set_index('contig')['bin'].to_dict()
    gtdb = pd.read_table(f'fig3/tmp/{filename}/gtdbtk/gtdbtk.bac120.summary.tsv')
    # the bin number is the integer after the last '_' of the genome name
    gtdb['bin'] = gtdb.user_genome.str.split('_').str.get(-1).astype(int)
    gtdb['species'] = gtdb.classification.str.split('s__').str.get(-1)
    b2gtdb = gtdb.set_index('bin').species.to_dict()
    c2gtdb = {contig: b2gtdb.get(bin_id) for contig, bin_id in c2b.items()}

    sarg = pd.DataFrame(
        parse_diamond(f'fig3/tmp/{filename}/sarg.tsv'),
        columns=['contig', 'sarg', 'qstart', 'qend', 'pident'])
    sarg['type'] = sarg['sarg'].str.split('|').str.get(1).str.split('@').str.get(0)
    sarg['subtype'] = sarg['sarg'].str.split('|').str.get(2)
    sarg['species'] = sarg.contig.map(c2gtdb)

    # drop every "_<capital letter>" occurrence from the species name
    suffix = re.compile('_[A-Z]')
    sarg.species = sarg.species.fillna('unclassified').apply(lambda x: suffix.sub('', x))

    # harmonise names with the labels used elsewhere in the notebook
    aliases = {
        'Bacillus thuringiensis': 'Bacillus cereus',
        '': 'unclassified',
        'Unclassified Bacteria': 'unclassified'
    }
    sarg.species = sarg.species.apply(lambda x: aliases.get(x, x))

    return (
        sarg.groupby(['species', 'type'], as_index=False).size().rename(columns={'size': 'est'}),
        sarg.groupby(['species', 'subtype'], as_index=False).size().rename(columns={'size': 'est'})
    )

def parse_argo(filename):
    """Sum Argo ARG abundances per species for the read-based approach.

    Reads ``fig3/tmp/{filename}/{filename}.sarg.tsv``, which must provide the
    columns ``lineage``, ``type``, ``subtype`` and ``abundance``. The species
    is the last ``';'``-separated element of ``lineage``.

    Parameters
    ----------
    filename : str
        Sample name; used for both the directory and the file prefix.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(by_type, by_subtype)``; each holds the grouping columns plus
        ``est``, the summed ``abundance`` of that group.
    """
    # imported here because the notebook's import cell does not import `re`
    import re

    sarg = pd.read_table(f'fig3/tmp/{filename}/{filename}.sarg.tsv')
    sarg['species'] = sarg['lineage'].str.split(';').str.get(-1)

    # drop every "_<capital letter>" occurrence from the species name
    suffix = re.compile('_[A-Z]')
    sarg.species = sarg.species.apply(lambda x: suffix.sub('', x))

    aliases = {'Bacillus thuringiensis': 'Bacillus cereus'}
    sarg.species = sarg.species.apply(lambda x: aliases.get(x, x))

    return (
        sarg.groupby(['species', 'type'], as_index=False)['abundance'].sum().rename(columns={'abundance': 'est'}),
        sarg.groupby(['species', 'subtype'], as_index=False)['abundance'].sum().rename(columns={'abundance': 'est'}),
    )

def parse_real():
    """Load the reference table and total ``cnt`` per species and ARG level.

    Reads ``sarg/pathogen.sarg.tsv`` and returns ``(by_type, by_subtype)``,
    where each frame sums ``cnt`` within ``species`` x ``type`` and
    ``species`` x ``subtype`` respectively.
    """
    truth = pd.read_table('sarg/pathogen.sarg.tsv')
    by_type, by_subtype = (
        truth.groupby(['species', level], as_index=False)['cnt'].sum()
        for level in ('type', 'subtype')
    )
    return by_type, by_subtype

In [4]:
# Per-sample comparison of the three profiling approaches against the
# reference table, at ARG type level (t*) and subtype level (s*).
tval = []  # merged cnt/est tables, type level
sval = []  # merged cnt/est tables, subtype level

tstat = []  # one row of summary statistics per sample x method, type level
sstat = []  # same, subtype level

# order matters: it fixes the order of rows appended for each sample
_METHODS = ('read-based', 'assembly-based', 'binning-based')


def _join_truth(real, estimated, method):
    """Outer-join reference counts with one method's estimates.

    The join uses the columns shared by both frames; combinations missing on
    either side get 0 after the fill.
    """
    return pd.merge(real, estimated, how='outer').assign(method=method).fillna(0)


def _score(table, method, sample, coverage):
    """Build one statistics row (distances, correlations, precision/recall)."""
    pearson = pearsonr(table.cnt, table.est)
    spearman = spearmanr(table.cnt, table.est)
    return [
        cityblock(table.cnt, table.est),
        euclidean(table.cnt, table.est),
        pearson.statistic, pearson.pvalue,
        spearman.statistic, spearman.pvalue,
        method, sample, coverage, *pr(table)]


# the reference table does not depend on the sample, so read it only once
real_type, real_subtype = parse_real()

for filename in tqdm(glob.glob('fig3/data/*.fa')):
    # file names look like "<sample>_<coverage>x.fa"
    filename = filename.split('/')[-1].split('.')[0]
    sample = filename.split('_')[0]
    coverage = int(filename.split('_')[1].split('x')[0])

    # each parser returns (type-level, subtype-level) estimates
    estimates = (
        parse_argo(filename),
        parse_kraken(filename),
        parse_gtdb(filename),
    )

    type_tables = [
        _join_truth(real_type, est[0], method)
        for method, est in zip(_METHODS, estimates)
    ]
    subtype_tables = [
        _join_truth(real_subtype, est[1], method)
        for method, est in zip(_METHODS, estimates)
    ]

    tval.append(pd.concat(type_tables).assign(sample=sample, coverage=coverage))
    sval.append(pd.concat(subtype_tables).assign(sample=sample, coverage=coverage))

    tstat.extend(
        _score(table, method, sample, coverage)
        for table, method in zip(type_tables, _METHODS)
    )
    sstat.extend(
        _score(table, method, sample, coverage)
        for table, method in zip(subtype_tables, _METHODS)
    )

In [5]:
# Write the merged value tables and the summary statistics as TSV files,
# each sorted by method, sample and coverage.
_sort_keys = ['method', 'sample', 'coverage']
_stat_columns = [
    'l1', 'l2', 'corr', 'p', 'corr.spearman', 'p.spearman',
    'method', 'sample', 'coverage', 'precision', 'recall',
]

for _frames, _path in ((tval, 'fig3/tval.tsv'), (sval, 'fig3/sval.tsv')):
    _table = pd.concat(_frames).sort_values(_sort_keys)
    _table.to_csv(_path, sep='\t', index=False)

for _rows, _path in ((tstat, 'fig3/tstat.tsv'), (sstat, 'fig3/sstat.tsv')):
    _table = pd.DataFrame(_rows, columns=_stat_columns).sort_values(_sort_keys)
    _table.to_csv(_path, sep='\t', index=False)