In [1]:
from collections import defaultdict
from Bio import SeqIO

import regex as re
import pandas as pd
import json

In [2]:
rep = set()
ref = set()
rev = defaultdict(set)
tset = set()
with open('sarg_ref.fa') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        header = record.description.split(' >')[0].split(' ',1)[1].rsplit(' [')[0].split('MULTISPECIES: ')[-1]
        rev[header].add('@'.join(record.id.split('|')[1:3]))
        rep.add(record.seq)
        ref.add(record.id.split('|')[-1])
        tset.add('@'.join(record.id.split('|')[1:3]))

rev = {x: ' | '.join(sorted(y)) for x, y in rev.items()}

In [3]:
remark = pd.read_table('tmp/remark.tsv', names = ['accession', 'header', 'taxonomy', 'organisim', 'evidence', 'symbol', 'identifier']).fillna('NA')

## bugfix
## https://www.ncbi.nlm.nih.gov/genome/annotation_prok/evidence/NBR012193/
remark.loc[remark.evidence == 'NBR012193', 'header'] = 'quaternary ammonium compound efflux SMR transporter EmrC'

## clean the others
remark = remark[~remark.accession.isin(remark[remark.evidence.str.contains('NA|Domain')].accession)]
remark = remark[~remark.accession.isin(remark[(remark.evidence.str.contains('TIGR') | (remark.identifier.str.contains('PF'))) & (remark.symbol == 'NA')].accession)]
remark = remark[~remark.accession.isin(ref)]
remark['cnt'] = remark.groupby(['header', 'evidence']).transform('size')
a2h = remark.set_index('accession').header.to_dict()

In [4]:
rmk = remark[['header', 'evidence', 'symbol', 'identifier', 'cnt']].drop_duplicates()
rmk['mapped'] = rmk.header.map(rev).fillna('')
rmk['unique'] = (rmk.mapped.str.count('\\|') == 0) & (rmk.mapped != '')
rmk['valid'] = rmk.apply(lambda x: re.sub('^bla|\\(|\\)', '', x.mapped.split('@')[-1].lower()) in re.sub('\\(|\\)', '', x.header.lower()) if len(x.mapped.split('@')[-1]) > 4 else x.mapped.lower().split('@')[-1] in x.header.lower(), axis=1)
rmk.loc[rmk.header.str.contains('/'), 'valid'] = False

In [5]:
with open('misc/whitelist.json') as f:
    wl = json.load(f)

rmkk = rmk[~((rmk.unique) & (rmk.valid))].copy()
rmkk['reviewed'] = rmkk.header.apply(lambda x: wl.get(x, {}).get('reviewed', ''))
rmkk = rmkk.sort_values(['header', 'evidence', 'symbol'])

## easy cases
bla = rmkk[(rmkk.symbol.str.contains('^bla', regex=True)) & (rmkk.symbol.str.replace('^bla', '', regex=True) == rmkk.header.str.split(' |-').str.get(0)) & (~rmkk.symbol.str.contains('CMA|CSA|CFE|LAT|LCR|NPS', regex=True))].copy()
bla['gene'] = 'beta-lactam@' + bla.symbol
mcr = rmk[rmk.header.str.contains('^MCR', regex=True)].copy()
mcr['gene'] = 'colistin@' + mcr.header.str.split(' |-related').str.get(0).str.lower()

rmkk = rmkk[~(rmkk.header.isin(bla.header) | (rmkk.header.isin(mcr.header)))]
wl = defaultdict(dict)
for _, i in rmkk.iterrows():
    wl[i.header]['reviewed'] = i.reviewed
    wl[i.header]['gene'] = i.mapped
    if 'remark' not in wl[i.header]:
        wl[i.header]['remark'] = []
    wl[i.header]['remark'].append(' | '.join([i.evidence, i.symbol, i.identifier, str(i['cnt'])]))

wl = {x:y for x,y in wl.items() if x in set(rmkk.header)}
with open('misc/whitelist.json', 'w') as f:
    json.dump(wl, f, sort_keys=True, indent=4)

print('unreviewed:', rmkk.header.nunique())
assert sum(rmkk.reviewed == '') == 0, 'Make sure all uncertain cases are reviewed in <misc/whitelist.json>.'

rev.update(rmkk.set_index('header').reviewed.to_dict())
rev.update(bla.set_index('header').gene.to_dict())
rev.update(mcr.set_index('header').gene.to_dict())

unreviewed: 283


In [6]:
for i,j in rmkk[(rmkk.reviewed == '') | (rmkk.reviewed.str.contains(' '))].groupby(['header', 'mapped', 'reviewed']):
    print(f'{i[0]}')
    print(f'gene: {i[1]}')
    print(f'reviewed: {i[2]}')

    for _, k in j.iterrows():
        print(k.evidence, k.symbol, k.identifier, k['cnt'])

    print('\n')

In [7]:
records = []
ids = set()
ads = set(remark.accession)
i2n = dict()
sset = set()
with open('tmp/sarg_raw.fa') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        if record.seq not in rep:
            header = record.description.split(' >')[0].split(' ',1)[1].rsplit(' [')[0].split('MULTISPECIES: ')[-1]
            gene = rev.get(header)
            if gene is not None and gene != '' and gene != '*':
                if record.id in ads:
                    ids.add(record.id)
                    i2n[record.id] = header
                    record.id = 'SARG|' + gene.replace('@', '|') + '|' + record.id
                    record.description = record.description.split(' ', 1)[-1].split(' >')[0]
                    records.append(record)
                    sset.add(gene)
                    if '|' in gene:
                        print(gene, 'header')

with open('tmp/sarg_tmp.fa', 'w') as output_handle:
    SeqIO.write(records, output_handle, 'fasta')

remark['name'] = remark.accession.map(i2n).fillna('')
remark['type'] = remark.name.map(rev).str.split('@').str.get(0).fillna('')
remark['subtype'] = remark.name.map(rev).str.split('@').str.get(1).fillna('')
remark[remark.accession.isin(ids)].groupby(['type', 'subtype', 'name', 'evidence', 'symbol', 'identifier'], as_index=False).size().rename({'size': 'cnt'}, axis=1).to_csv('misc/evidence.tsv', sep='\t', index=False)

In [8]:
sset - tset

{"aminoglycoside@aph(3'')*",
 'aminoglycoside@rmt*',
 'beta-lactam@blaASDC',
 'beta-lactam@blaCSR',
 'beta-lactam@mec*',
 'macrolide-lincosamide-streptogramin@erm*',
 'macrolide-lincosamide-streptogramin@lsa*',
 'macrolide-lincosamide-streptogramin@mph*',
 'macrolide-lincosamide-streptogramin@msr*',
 'macrolide-lincosamide-streptogramin@sal*',
 'macrolide-lincosamide-streptogramin@vat*',
 'macrolide-lincosamide-streptogramin@vga*',
 'nitroimidazole@nim*',
 'rifamycin@rph*',
 'tetracycline@tet*',
 'trimethoprim@dfr*'}