# Fig 5. Application of Argo to human and non-human primate fecal samples

#### Dependencies:
    - argo==0.1.0
    - pandas
    - numpy
    - tqdm
    - biopython

#### Inputs:
    - fig5/data/*/*.fa (329 metagenomic samples, quality controlled with porechop '--discard_middle' & nanoq '-q 10 -l 1000')

In [1]:
# One-off preprocessing, kept commented out so that re-running the notebook
# does not repeat it. Step 1 runs argo on every fig5/data/*/*.fa file and
# writes its output to fig5/tmp. Step 2 reads, for each sample, the
# fig5/tmp/<accession>.json file, collects the ids whose 'remark' is
# 'putatively non-prokaryotic', and sums the sequence lengths of all reads
# (total_bases) and of the flagged reads (negative_bases). The result is a
# one-row tab-separated table fig5/tmp/<accession>.bases with the columns
# project, accession, total_bases and negative_bases; 'project' is the name of
# the directory that contains the .fa file.
# %%bash
# mkdir -p fig5/tmp
# argo fig5/data/*/*.fa -d db/argo -o fig5/tmp --skip-clean -t 48 -z 0 --plasmid -k db/kraken-pluspf

# python -c "
# import os
# import json
# import glob
# import pandas as pd
# from Bio import SeqIO

# for file in glob.glob('fig5/data/*/*.fa'):
#     filename = os.path.basename(file).split('.fa')[0]
#     project = file.split('/')[-2]
#     with open(f'fig5/tmp/{filename}.json') as f:
#         ff = json.load(f)
#     nset = {i for i,j in ff.items() if j['remark'] == 'putatively non-prokaryotic'}

#     cnt = 0
#     cntt = 0
#     with open(file, 'rt') as handle:
#         for record in SeqIO.parse(handle, 'fasta'):
#             cntt += len(record.seq)
#             if record.id in nset:
#                 cnt += len(record.seq)
#     pd.DataFrame([[project, filename, cntt, cnt]], columns = ['project', 'accession', 'total_bases', 'negative_bases']).to_csv(f'fig5/tmp/{filename}.bases', index=False, sep='\t')
# "

In [2]:
import pandas as pd
import glob
import json
import os
from tqdm import tqdm
import re

import numpy as np

In [3]:
# Load the per-sample files in fig5/tmp and assemble the three tables that
# every later cell works from:
#   stats - concatenated *.tsv profiles (excluding *.sarg.tsv), one row per entry
#   bases - one row per sample: retained bases, genome copies and country
#   args  - concatenated *.sarg.tsv tables with genus/species parsed from lineage
f2gc = dict()   # accession -> sum of the profile's 'copy' column
f2div = dict()  # accession -> Shannon index (natural log) of 'abundance'

# Compiled once instead of once per mapped element.
# NOTE(review): this removes an underscore followed by ONE uppercase letter
# wherever it occurs; confirm that multi-letter suffixes (e.g. '_AB') never
# appear in the names, otherwise the trailing letters would be left behind.
suffix_re = re.compile('_[A-Z]')


def _accession(path, extension):
    """Return the file name of *path* with the trailing *extension* removed.

    Used for every table so that the accession keys agree across them, even
    if an accession itself contains a dot, and independently of the path
    separator returned by glob.
    """
    return os.path.basename(path)[:-len(extension)]


stats = []
for file in tqdm(glob.glob('fig5/tmp/*.tsv')):
    # '*.tsv' also matches the '*.sarg.tsv' tables, which are loaded further down.
    if file.endswith('.sarg.tsv'):
        continue
    # read_table opens the path itself; no separate file handle is needed.
    df = pd.read_table(file)
    filename = _accession(file, '.tsv')
    stats.append(df.assign(accession=filename))
    f2gc[filename] = df['copy'].sum()
    # Zero abundances are skipped: 0 * log(0) evaluates to NaN and would turn
    # the whole index into NaN. Missing values are kept and still propagate.
    abundance = df['abundance']
    abundance = abundance[abundance != 0]
    f2div[filename] = sum(abundance.map(np.log) * abundance * -1)

stats = pd.concat(stats)
stats['genus'] = stats.genus.map(lambda x: suffix_re.sub('', x))

bases = []
for file in tqdm(glob.glob('fig5/tmp/*.bases')):
    df = pd.read_table(file)
    # Bases left after subtracting negative_bases, in units of 1e6 bases.
    df['bases'] = (df.total_bases - df.negative_bases) / 1e6
    bases.append(df.drop(['total_bases', 'negative_bases'], axis=1))

bases = pd.concat(bases)
bases['gc'] = bases.accession.map(f2gc)
# Retained bases (in 1e6) per genome copy.
bases['ags'] = bases['bases'] / bases['gc']

# Projects that are not listed here get a missing country.
bases['country'] = bases.project.map({
    'PRJDB8879': 'KR',
    'PRJNA798244': 'KR',
    'PRJEB49168': 'SG',
    'PRJNA763692': 'CN',
    'PRJNA820119': 'CN',
    'PRJNA842693': 'NH',
    'PRJNA929328': 'DE',
    'PRJNA940499': 'US',
    'PRJNA508395': 'US',
})

args = []
for file in tqdm(glob.glob('fig5/tmp/*.sarg.tsv')):
    df = pd.read_table(file).assign(accession=_accession(file, '.sarg.tsv'))
    args.append(df)

args = pd.concat(args)
# The last two ';'-separated lineage fields are taken as genus and species.
# Both get fillna('') so that a missing or too-short lineage becomes an empty
# name instead of raising inside the regex substitution.
ranks = args.lineage.str.split(';')
args['genus'] = ranks.str.get(-2).fillna('').map(lambda x: suffix_re.sub('', x))
args['species'] = ranks.str.get(-1).fillna('').map(lambda x: suffix_re.sub('', x))

100%|████████████████████████████████████████| 658/658 [00:01<00:00, 399.41it/s]
100%|████████████████████████████████████████| 329/329 [00:00<00:00, 419.90it/s]
100%|████████████████████████████████████████| 329/329 [00:01<00:00, 325.87it/s]


#### Fig. 5a

In [4]:
# Genome copies per sample and genus, plus one 'all' row per sample holding
# the sample-wide total.
genome_copies = pd.concat([
    stats.groupby(['accession', 'genus'], as_index=False)['copy'].sum(),
    stats.groupby('accession', as_index=False)['copy'].sum().assign(genus='all'),
])

# The same layout for ARG copies, with the value column renamed.
arg_copies = pd.concat([
    args.groupby(['accession', 'genus'], as_index=False)['copy'].sum(),
    args.groupby('accession', as_index=False)['copy'].sum().assign(genus='all'),
]).rename(columns={'copy': 'arg_copy'})

# Inner join on the shared columns, then ARG copies per genome copy.
df = pd.merge(genome_copies, arg_copies)
df['cpg'] = df['arg_copy'] / df['copy']
df = pd.merge(bases, df)


def _genus_group(genus):
    """Label the 'all' row, the two genera marked as pathogen, and the rest."""
    if genus == 'all':
        return 'all'
    if genus in {'Streptococcus', 'Escherichia'}:
        return 'pathogen'
    return 'commensal'


df['group'] = df.genus.map(_genus_group)

df.sort_values(['country', 'accession']).to_csv('fig5/genus_copies.tsv', sep='\t', index=False)

#### Fig. 5b–c

In [5]:
# Per-sample summary: diversity, total ARG copies and ARG copies per genome copy.
arg_totals = args.groupby('accession')['copy'].sum().to_dict()

# assign() returns a new frame, so `bases` itself is left untouched.
df = bases.assign(
    div=bases.accession.map(f2div),
    arg_copies=bases.accession.map(arg_totals),
)
df = df.assign(cpg=df['arg_copies'] / df['gc'])

df.sort_values(['country', 'accession']).to_csv('fig5/total_copies.tsv', sep='\t', index=False)

#### Fig. 5d–e

In [6]:
# Table 1: samples x genera matrix of summed abundance (0 where a genus is
# absent from a sample), with the country attached.
genus_abundance = stats.groupby(['accession', 'genus'])['abundance'].sum()
genus_abundance = genus_abundance.unstack().fillna(0).reset_index()

df = pd.merge(bases[['country', 'accession']], genus_abundance)
df.sort_values(['country', 'accession']).to_csv('fig5/abun.tsv', sep='\t', index=False)

# Table 2: samples x ARG types matrix of ARG copies per genome copy.
sample_copies = stats.groupby(['accession'], as_index=False)['copy'].sum()
type_copies = args.groupby(['accession', 'type'], as_index=False)['copy'].sum()
type_copies = type_copies.rename(columns={'copy': 'arg.copy'})

df = pd.merge(sample_copies, type_copies)
df['cpg'] = df['arg.copy'] / df['copy']
df = pd.merge(df, bases[['accession', 'country']])

# Only the ratio is pivoted; one column per ARG type, missing combinations -> 0.
df = df.set_index(['country', 'accession', 'type'])['cpg'].unstack().reset_index().fillna(0)
df.sort_values(['country', 'accession']).to_csv('fig5/abun2.tsv', sep='\t', index=False)

#### Fig. 5f–g

In [7]:
# ARG rows whose species name contains 'Escherichia coli'.
is_ecoli_arg = args.species.str.contains('Escherichia coli')
ec = args[is_ecoli_arg].copy()

# Genome copies of the same species, summed per country.
cp = pd.merge(stats, bases)
cp = cp[cp.species.str.contains('Escherichia coli')]
cp = cp.groupby('country')['copy'].sum().reset_index()

# The ARG rows joined with the sample table; reused for both outputs below.
ec_samples = pd.merge(ec, bases)

# Output 1: per country and ARG type - summed ARG copies, number of distinct
# subtypes, and ARG copies per genome copy.
by_type = ec_samples.groupby(['country', 'type'], as_index=False)
type_copies = by_type['copy'].sum().rename(columns={'copy': 'arg.copy'})
type_richness = by_type['subtype'].nunique().rename(columns={'subtype': 'n'})

df = pd.merge(type_copies, type_richness)
df = pd.merge(df, cp)
df['cpg'] = df['arg.copy'] / df['copy']
df.sort_values('country').to_csv('fig5/ecoli.tsv', sep='\t', index=False)

# Output 2: the same ratio resolved down to individual subtypes.
by_subtype = ec_samples.groupby(['country', 'type', 'subtype'], as_index=False)
df = by_subtype['copy'].sum().rename(columns={'copy': 'arg.copy'})
df = pd.merge(df, cp)
df['cpg'] = df['arg.copy'] / df['copy']
df.sort_values('country').to_csv('fig5/ecoli2.tsv', sep='\t', index=False)

#### Fig. 5h

In [8]:
# Spearman correlation, across samples, between species@subtype pairs for
# ARG subtypes whose name matches 'sul' or 'mcr'.
species_copies = stats.groupby(['accession', 'species'], as_index=False)['copy'].sum()
subtype_copies = args.groupby(['accession', 'subtype', 'species'], as_index=False)['copy'].sum()
subtype_copies = subtype_copies.rename(columns={'copy': 'arg.copy'})

# Inner join on accession and species, then ARG copies per genome copy.
df = pd.merge(species_copies, subtype_copies)
df['cpg'] = df['arg.copy'] / df['copy']

# Keep species with at least one genome copy in the sample and the subtypes of
# interest. The explicit copy() makes the filtered frame independent, so the
# column assignment below does not trigger pandas' SettingWithCopyWarning.
keep = (df['copy'] >= 1) & df.subtype.str.contains('sul|mcr')
df = df[keep].copy()
df['id'] = df['species'] + '@' + df['subtype']

# Inner join: drops any accession that is absent from `bases`.
df = pd.merge(df, bases[['accession', 'country']])

# Samples x (species@subtype) matrix of the ratio; absent pairs count as 0.
df = df.set_index(['accession', 'id'])['cpg'].unstack().fillna(0)
df.corr(method='spearman').sort_index().to_csv('fig5/cor.tsv', sep='\t')