# Fig 4. Performance evaluation with sequenced mock communities

####  Dependencies:
    - argo==0.1.0
    - pandas
    - scipy>=1.9

#### Inputs:
    - fig4/data/*.fa (mock communities d6300, d6331, msa1003, and msa2006, quality-controlled with porechop '--discard_middle' & nanoq '-q 10 -l 1000', non-prokaryotic/unmapped reads removed with minimap2 + seqkit)

In [1]:
# %%bash
# mkdir -p fig4/tmp
# argo fig4/data/*.fa -d db/argo -o fig4/tmp --skip-clean -t 48 -z 0 --plasmid

In [2]:
import pandas as pd
import re
from scipy.stats import pearsonr

def pr(df):
    """Return ``(precision, recall)`` of detections against the reference.

    Rows whose ``species`` is ``'unclassified'`` are ignored.  A row counts
    as *detected* when ``est`` is non-zero and as *expected* when ``cnt`` is
    non-zero; a true positive is a row that is both.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns ``species``, ``est`` and ``cnt``.

    Returns
    -------
    tuple of float
        ``(precision, recall)``.  A metric whose denominator is zero (no
        detected rows for precision, no expected rows for recall) is
        undefined and is returned as ``nan`` instead of raising
        ``ZeroDivisionError``.
    """
    df = df[df['species'] != 'unclassified']
    detected = df['est'] != 0
    expected = df['cnt'] != 0

    # Count with boolean masks rather than materialising filtered frames.
    n_true_positive = int((detected & expected).sum())
    n_detected = int(detected.sum())
    n_expected = int(expected.sum())

    nan = float('nan')
    precision = n_true_positive / n_detected if n_detected else nan
    recall = n_true_positive / n_expected if n_expected else nan
    return precision, recall

In [3]:
# Subtype-level evaluation: for every mock community, merge the estimated
# abundances with the reference counts, record precision/recall and the
# Pearson correlation, and write the merged table to fig4/.
p = []  # rows of [precision, recall, filename, level]
r = []  # rows of [pearson statistic, p-value, filename, level]
folder = 'fig4/tmp'

## fixed according to gtdbtk's taxonomic assignments
species_fixes = {
    "Bacillus subtilis": "Bacillus spizizenii",
    "Lactobacillus fermentum": "Limosilactobacillus fermentum",
    "Schaalia odontolytica": "Pauljensenia odontolytica",
    "Clostridium perfringens": "Sarcina perfringens",
}
# Additional renames that apply to a single community only.
community_fixes = {
    'msa1003': {"Pseudomonas aeruginosa": "Pseudomonas paraeruginosa"},
    'd6331': {"Fusobacterium nucleatum": "Fusobacterium animalis"},
}

for filename in ['d6300', 'd6331', 'msa1003', 'msa2006']:
    # Estimates: the species is the last field of the ';'-separated lineage,
    # with every "_<uppercase letter>" fragment removed.
    est = pd.read_table(f'{folder}/{filename}.sarg.tsv')
    est['species'] = est['lineage'].str.split(';').str.get(-1)
    est['species'] = est.species.apply(lambda x: re.sub('_[A-Z]', '', x))
    est['est'] = est['abundance']
    est = est[est.species != 'unclassified']
    # NOTE(review): est keeps one row per genome, so a (species, subtype)
    # pair present on several genomes repeats its reference cnt in the
    # merged table below -- confirm this is intended.
    est = est.groupby(['species', 'subtype', 'genome']).est.sum().reset_index()

    # Reference: apply the shared renames, strip the suffix fragments, then
    # apply the community-specific renames (same order as before).
    real = pd.read_table(f'sarg/{filename}.sarg.tsv')
    real.species = real.species.apply(lambda x: species_fixes.get(x, x))
    real.species = real.species.apply(lambda x: re.sub('_[A-Z]', '', x))
    extra_fixes = community_fixes.get(filename)
    if extra_fixes:
        real.species = real.species.apply(lambda x: extra_fixes.get(x, x))
    real = real.groupby(['species', 'subtype']).cnt.sum().reset_index()

    # The two tables share exactly these key columns; name them explicitly.
    # Rows missing on either side get 0 in every column they lack.
    df = pd.merge(real, est, how='outer', on=['species', 'subtype']).fillna(0)
    p.append([*pr(df), filename, 'subtype'])

    # Compute the correlation once and read both fields from the result.
    corr = pearsonr(df.cnt, df.est)
    r.append([corr.statistic, corr.pvalue, filename, 'subtype'])

    df.to_csv(f'fig4/{filename}.subtype.tsv', index=False, sep='\t')

In [4]:
# Type-level evaluation: same procedure as the subtype-level cell, grouped
# by 'type'.  Appends to the existing `p` and `r` lists and reads `folder`,
# all of which must already be defined when this cell runs.

## fixed according to gtdbtk's taxonomic assignments
species_fixes = {
    "Bacillus subtilis": "Bacillus spizizenii",
    "Lactobacillus fermentum": "Limosilactobacillus fermentum",
    "Schaalia odontolytica": "Pauljensenia odontolytica",
    "Clostridium perfringens": "Sarcina perfringens",
}
# Additional renames that apply to a single community only.
community_fixes = {
    'msa1003': {"Pseudomonas aeruginosa": "Pseudomonas paraeruginosa"},
    'd6331': {"Fusobacterium nucleatum": "Fusobacterium animalis"},
}

for filename in ['d6300', 'd6331', 'msa1003', 'msa2006']:
    # Estimates: the species is the last field of the ';'-separated lineage,
    # with every "_<uppercase letter>" fragment removed.
    est = pd.read_table(f'{folder}/{filename}.sarg.tsv')
    est['species'] = est['lineage'].str.split(';').str.get(-1)
    est['species'] = est.species.apply(lambda x: re.sub('_[A-Z]', '', x))
    est['est'] = est['abundance']
    est = est[est.species != 'unclassified']
    # NOTE(review): est keeps one row per genome, so a (species, type) pair
    # present on several genomes repeats its reference cnt in the merged
    # table below -- confirm this is intended.
    est = est.groupby(['species', 'type', 'genome']).est.sum().reset_index()

    # Reference: apply the shared renames, strip the suffix fragments, then
    # apply the community-specific renames (same order as before).
    real = pd.read_table(f'sarg/{filename}.sarg.tsv')
    real.species = real.species.apply(lambda x: species_fixes.get(x, x))
    real.species = real.species.apply(lambda x: re.sub('_[A-Z]', '', x))
    extra_fixes = community_fixes.get(filename)
    if extra_fixes:
        real.species = real.species.apply(lambda x: extra_fixes.get(x, x))
    real = real.groupby(['species', 'type']).cnt.sum().reset_index()

    # The two tables share exactly these key columns; name them explicitly.
    # Rows missing on either side get 0 in every column they lack.
    df = pd.merge(real, est, how='outer', on=['species', 'type']).fillna(0)
    p.append([*pr(df), filename, 'type'])

    # Compute the correlation once and read both fields from the result.
    corr = pearsonr(df.cnt, df.est)
    r.append([corr.statistic, corr.pvalue, filename, 'type'])

    df.to_csv(f'fig4/{filename}.type.tsv', index=False, sep='\t')

In [5]:
# Save the collected precision/recall rows; here column 'p' is precision and
# column 'r' is recall (the order returned by pr()).
pr_table = pd.DataFrame(p, columns=['p', 'r', 'filename', 'type'])
pr_table.to_csv('fig4/pr.tsv', sep='\t', index=False)

In [6]:
# Save the collected correlation rows; here column 'r' is the Pearson
# statistic and column 'p' is its p-value.
corr_table = pd.DataFrame(r, columns=['r', 'p', 'filename', 'type'])
corr_table.to_csv('fig4/corr.tsv', sep='\t', index=False)