In [None]:
import json
import re
from collections import defaultdict

import pandas as pd
from Bio import SeqIO
# Build a lookup from sequence -> primary NR header (the text before the first ' >').
# Entries from sarg2nr.fa take priority; sarg2env_nr.fa only fills in sequences
# that sarg2nr.fa did not provide.
bxy = dict()
with open('sarg2nr.fa') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        bxy[record.seq] = record.description.split(' >')[0]
with open('sarg2env_nr.fa') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        bxy.setdefault(record.seq, record.description.split(' >')[0])

# Re-label every record of init.fasta: keep the first three '|'-separated id
# fields, append the NR accession as a fourth field, and use the rest of the
# NR header as the description.
res = []
with open('init.fasta') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        nr = bxy.get(record.seq)
        if nr is None:
            # Fail with the offending record instead of an AttributeError on None.
            raise KeyError(
                f'{record.id}: sequence not found in sarg2nr.fa or sarg2env_nr.fa'
            )
        nr_accession = nr.split(' ')[0]
        record.id = '|'.join(record.id.split('|')[:3] + [nr_accession])
        # If the NR header has no space, this is the whole header (same as the accession).
        record.description = nr.split(' ', 1)[-1]
        res.append(record)

with open('sarg_ref.fasta', 'w') as output_handle:
    SeqIO.write(res, output_handle, 'fasta')

In [None]:
%%bash
# Print the MD5 checksum of sarg_ref.fasta.
md5sum sarg_ref.fasta

In [None]:
# Index sarg_ref.fasta:
#   rep         - every reference sequence
#   ref         - gene (3rd id field) -> set of cleaned headers
#   rev         - cleaned header -> set of genes
#   rep_ids     - last id field of every reference record
#   gene2family - gene (3rd id field) -> family (2nd id field)
rep = set()
ref = defaultdict(set)
rev = defaultdict(set)
rep_ids = set()
gene2family = dict()
with open('sarg_ref.fasta') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        # Clean the header step by step: keep the text before the first ' >',
        # drop the leading id token, cut at the first ' [', and strip any
        # 'MULTISPECIES: ' prefix.
        primary = record.description.split(' >')[0]
        without_id = primary.split(' ', 1)[1]
        without_bracket = without_id.rsplit(' [')[0]
        header = without_bracket.split('MULTISPECIES: ')[-1]

        id_fields = record.id.split('|')
        gene_name = id_fields[2]

        ref[gene_name].add(header)
        rev[header].add(gene_name)
        rep.add(record.seq)
        rep_ids.add(id_fields[-1])
        gene2family[gene_name] = id_fields[1]

In [None]:
# Ids of the sarg_raw.fasta records whose sequence is not among the reference sequences.
ids = set()
with open('sarg_raw.fasta') as handle:
    ids.update(
        entry.id
        for entry in SeqIO.parse(handle, 'fasta')
        if entry.seq not in rep
    )

In [None]:
# Load the per-sequence annotation table; missing values become the literal 'NA'.
remark = pd.read_table(
    'remark.tsv',
    names=['id', 'header', 'taxonomy', 'organisim', 'evidence', 'gs', 'source'],
).fillna('NA')
# Keep only the non-reference sequences. `.copy()` makes this an independent
# frame so the column assignment below does not raise SettingWithCopyWarning.
remark = remark[remark.id.isin(ids)].copy()

# Ids to discard, rule 1: evidence contains 'NA' or 'Domain'.
idnset = set(remark[remark.evidence.str.contains('NA|Domain')].id)
# Rule 2: evidence contains 'TIGR' or source contains 'PF', and gs is 'NA'.
idnset.update(set(remark[(remark.evidence.str.contains('TIGR') | (remark.source.str.contains('PF'))) & (remark.gs == 'NA')].id))

# Rule 3: for each accession (evidence text before the first '.') keep only the
# evidence string that sorts last; ids carrying any other evidence are discarded.
# NOTE(review): the sort is lexicographic, so a suffix '.10' would rank below
# '.9' - confirm whether evidence versions can reach two digits.
remark['accession'] = remark.evidence.str.split('.').str.get(0)
latest = set(remark.sort_values(['accession', 'evidence'], ascending=False).groupby('accession', as_index=False).first().evidence)
idnset.update(remark[~remark.evidence.isin(latest)].id.unique())

# Drop the discarded ids (again as an independent copy), then count how many
# rows share each (header, evidence) pair.
remark = remark[~remark.id.isin(idnset)].copy()
remark['#'] = remark.groupby(['header', 'evidence']).transform('size')
id2header = remark.set_index('id').header.to_dict()

In [None]:
def _gene_named_in_header(row):
    """Return True when the mapped gene label appears inside the row's header.

    The comparison is case-insensitive. For labels longer than four characters,
    a leading 'bla' and any parentheses are removed from the label, and
    parentheses are removed from the header, before the substring check.
    """
    gene = row.mapper.lower()
    header = row.header.lower()
    if len(row.mapper) > 4:
        return re.sub(r'^bla|\(|\)', '', gene) in re.sub(r'\(|\)', '', header)
    return gene in header


# One row per distinct (header, evidence, gs, source, #); `.copy()` keeps the
# column assignments below off a slice of `remark`.
rmk = remark[['header', 'evidence', 'gs', 'source', '#']].drop_duplicates().copy()
# Reference gene(s) that already carry the same header, joined with ' | '.
rmk['mapper'] = rmk.header.apply(lambda x: ' | '.join(sorted(rev.get(x, set()))))
# 'unique': exactly one reference gene (non-empty and no '|' separator).
rmk['unique'] = (rmk.mapper.str.count(r'\|') == 0) & (rmk.mapper != '')
rmk['valid'] = rmk.apply(_gene_named_in_header, axis=1)
# Headers containing '/' are never considered valid.
rmk.loc[rmk.header.str.contains('/'), 'valid'] = False

with open('whitelist.json') as f:
    wl = json.load(f)

# Everything that is not both unique and valid goes to the whitelist, carrying
# over any 'reviewed' value already stored in whitelist.json.
rmkk = rmk[~(rmk['unique'] & rmk['valid'])].copy()
rmkk['reviewed'] = rmkk.header.apply(lambda x: wl.get(x, {}).get('reviewed', ''))
rmkk = rmkk.sort_values(['header', 'evidence', 'gs'])

# Rebuild the whitelist from scratch so it only contains current headers.
wl = {}
for _, row in rmkk.iterrows():
    entry = wl.setdefault(row.header, {})
    entry['reviewed'] = row.reviewed
    entry['gene'] = row.mapper
    entry.setdefault('remark', []).append(
        ' | '.join([row.evidence, row.gs, row.source, str(row['#'])])
    )

# Serialise before opening the file for writing, so a serialisation error
# cannot leave whitelist.json truncated.
serialized = json.dumps(wl, sort_keys=True, indent=4)
with open('whitelist.json', 'w') as f:
    f.write(serialized)

print(rmkk.header.nunique())
reviewed = rmkk.set_index('header').reviewed.to_dict()

In [None]:
# Print the whitelist entries whose 'reviewed' value is empty, is '*', or
# contains a space, one block per (header, mapper, reviewed) combination.
pending_mask = rmkk.reviewed.isin(['', '*']) | (rmkk.reviewed.str.contains(' '))
pending_groups = rmkk[pending_mask].groupby(['header', 'mapper', 'reviewed'])

for (hdr, mapped, status), rows in pending_groups:
    print(f'{hdr}')
    print(f'gene: {mapped}')
    print(f'reviewed: {status}')

    for _, row in rows.iterrows():
        print(row.evidence, row.gs, row.source, row['#'])

    print('\n')

In [None]:
# Add gene label -> family entries on top of those collected from sarg_ref.fasta.
# NOTE(review): presumably these are labels assigned through the whitelist
# 'reviewed' field - confirm against whitelist.json.
gene2family.update([
    ("aph(3'')*", 'aminoglycoside'),
    ('blaCIM', 'beta-lactam'),
    ('blaDYB', 'beta-lactam'),
    ('blaPNC', 'beta-lactam'),
    ('dfr*', 'trimethoprim'),
    ('erm*', 'macrolide-lincosamide-streptogramin'),
    ('lsa*', 'macrolide-lincosamide-streptogramin'),
    ('mec*', 'beta-lactam'),
    ('mph*', 'macrolide-lincosamide-streptogramin'),
    ('msr*', 'macrolide-lincosamide-streptogramin'),
    ('nim*', 'nitroimidazole'),
    ('rph*', 'rifamycin'),
    ('sal*', 'macrolide-lincosamide-streptogramin'),
    ('tet*', 'tetracycline'),
    ('vat*', 'macrolide-lincosamide-streptogramin'),
    ('vga*', 'macrolide-lincosamide-streptogramin'),
])

# header -> gene label: start from the automatic mapping, then let the
# whitelist's 'reviewed' values override it.
mapper = {**dict(zip(rmk.header, rmk.mapper)), **reviewed}

In [None]:
# Write sarg_ext.fasta: every non-reference sequence of sarg_raw.fasta that has
# a gene label, re-identified as 'SARG|<family>|<gene>|<original id>'.
records = []
with open('sarg_raw.fasta') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        if record.seq in rep:
            continue  # already part of the reference set

        gene = mapper.get(id2header.get(record.id))
        if gene is None or gene == '*':
            continue  # no label for this header, or labelled '*'

        family = gene2family.get(gene)
        if family is None:
            # Without this check the concatenation below fails with an opaque
            # "can only concatenate str (not NoneType)" TypeError.
            raise KeyError(
                f'no family in gene2family for gene {gene!r} (record {record.id}); '
                'finish the whitelist review or add the gene to gene2family'
            )

        record.id = 'SARG|' + family + '|' + gene + '|' + record.id
        # Drop the leading id token and anything after the first ' >'.
        record.description = record.description.split(' ', 1)[-1].split(' >')[0]
        records.append(record)

with open('sarg_ext.fasta', 'w') as output_handle:
    SeqIO.write(records, output_handle, 'fasta')