In [1]:
import json5
import glob
import pandas as pd
import regex as re
from Bio import SeqIO
from collections import defaultdict

In [2]:
## parse sarg.json
# Curation tables filled from sarg.json:
#   added          original sequence id -> 'REF|<top-level key>|<value>|<original id>'
#   discarded      every entry listed under a 'discarded' key (top-level or nested)
#   replaced_seqs  sequence id -> the 'changed' key whose list contains that id
#   replaced_accs  last '|'-separated field of every id stored in replaced_seqs
#   replaced_genes key -> value for every 'changed' entry whose value is not a list
added = {}
discarded = set()

replaced_accs = set()
replaced_seqs = dict()
replaced_genes = dict()

with open('sarg.json', 'r') as f:
    sarg = json5.load(f)

for group, entries in sarg.items():
    # The top-level 'discarded' entry is a plain collection, not a nested mapping.
    if group == 'discarded':
        discarded.update(entries)
        continue

    for action, payload in entries.items():
        if action == 'discarded':
            discarded.update(payload)
        elif action == 'changed':
            for target, value in payload.items():
                # Check the type BEFORE iterating: a string value would otherwise be
                # walked character by character, and a non-iterable value (null,
                # number, bool) would raise TypeError.
                if isinstance(value, list):
                    for seq_id in value:
                        replaced_seqs[seq_id] = target
                        replaced_accs.add(seq_id.split('|')[-1])
                else:
                    replaced_genes[target] = value
        elif action == 'added':
            for seq_id, name in payload.items():
                added[seq_id] = 'REF|' + group + '|' + name + '|' + seq_id

In [3]:
# records: SeqRecords kept so far; dups: sequences already seen, used to drop
# records whose sequence duplicates an earlier one.
records = []
dups = set()

## parse NDARO
ncbi = pd.read_table('reference/refgenes.tsv').fillna('NA')
# Use the RefSeq protein accession, falling back to the GenBank one where RefSeq is 'NA'.
ncbi['accession'] = ncbi['RefSeq protein'].where(ncbi['RefSeq protein'] != 'NA', ncbi['GenBank protein'])
# Keep rows whose accession occurs exactly once and whose Subtype is BIOCIDE or AMR.
# .copy() detaches the filtered frame so the column assignment below does not
# trigger pandas' SettingWithCopyWarning.
ncbi = ncbi[(ncbi.groupby('accession').transform('size') == 1) & (ncbi.Subtype.isin(['BIOCIDE', 'AMR']))].copy()
# id layout: NCBI|<type>:<subtype>|<class>|<gene family>
ncbi['id'] = (
    'NCBI|'
    + (ncbi['Type'] + ':' + ncbi['Subtype']).str.lower()
    + '|' + ncbi['Class'].str.lower().str.replace(' ', '_')
    + '|' + ncbi['Gene family'].str.replace(' ', '_')
)
acc2id = ncbi.set_index('accession')['id'].to_dict()

with open('reference/proteins.faa') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        # Proteins without a retained refgenes.tsv row are ignored.
        if record.id not in acc2id:
            continue
        record.id = acc2id[record.id] + '|' + record.id
        if record.id not in discarded and record.seq not in dups:
            dups.add(record.seq)
            records.append(record)

## parse CARD
card = pd.read_table('reference/aro_index.tsv').fillna('NA')
# id layout: <drug class, lower-cased, ' antibiotic' removed>|<CARD short name>, spaces -> '_'
card['id'] = (card['Drug Class'].str.lower().str.replace(' antibiotic', '') + '|' + card['CARD Short Name']).str.replace(' ', '_')
acc2id = card.set_index('ARO Accession')['id'].to_dict()

# glob yields an empty list when the file is absent, so a missing CARD file is skipped silently.
for file in glob.glob('reference/protein_fasta_protein_homolog_model.fasta'):
    with open(file) as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            aro = re.search('ARO:[0-9]+', record.description)
            # Fail with a readable message instead of an AttributeError/TypeError
            # when the header carries no ARO id or the id is not in aro_index.tsv.
            if aro is None or aro.group() not in acc2id:
                raise ValueError(f'no aro_index.tsv entry for CARD record: {record.description}')
            record.id = 'CARD|homolog|' + acc2id[aro.group()] + '|' + record.id.split('|')[-3]
            if record.id not in discarded and record.seq not in dups:
                dups.add(record.seq)
                records.append(record)

In [4]:
%%bash
# Sort reference/reference.fasta by sequence (-s). The sorted output goes to a
# temporary file that replaces the original only if seqkit succeeds, so the same
# path is never open for reading and writing at once.
seqkit sort -s reference/reference.fasta -o reference/reference.sorted.fasta --quiet \
    && mv reference/reference.sorted.fasta reference/reference.fasta

In [5]:
# Ids from reference.fasta that have no entry in `added`; each is also printed.
unused = []
with open('reference/reference.fasta') as reference_handle:
    for record in SeqIO.parse(reference_handle, 'fasta'):
        # A sequence that was already collected is skipped entirely.
        if record.seq in dups:
            continue
        dups.add(record.seq)

        if record.id not in added:
            unused.append(record.id)
            print(f'unused: {record.id}')
            continue

        # Swap the original id for the 'REF|...' id built while parsing sarg.json.
        record.id = added[record.id]
        records.append(record)

# Write every record collected so far to a single FASTA file.
with open('reference/sequence.fasta', 'w') as sequence_handle:
    SeqIO.write(records, sequence_handle, 'fasta')

In [6]:
# Rewrite every collected record's id to 'SARG|<label fields>|<name>' and keep the
# ones not listed in `discarded`. Records are modified in place, so this cell
# expects `records` as left by the previous cells (do not re-run it on its own).
records_renamed = []
for record in records:
    fields = record.id.split('|')
    is_ref = 'REF|' in record.id
    is_db = 'NCBI|' in record.id or 'CARD|' in record.id

    # Build the '<prefix>@<gene>' label. An explicit per-sequence replacement
    # wins over the label derived from the id fields.
    if (is_ref or is_db) and record.id in replaced_seqs:
        label = replaced_seqs[record.id]
    elif is_ref:
        label = '@'.join(fields[1:3])
    elif is_db and '|stress:biocide|' in record.id:
        label = 'biocide@' + fields[3]
    elif is_db:
        label = '@'.join(fields[2:4])
    else:
        label = record.id

    # Remember the gene part before the substitutions below can change it.
    gene = label.split('@')[-1]

    # Keys of replaced_genes are applied as regular-expression patterns.
    # NOTE(review): record.name is tested here before its 'sp|'/'tr|'/'gb|' prefix
    # is stripped below, so names in that form are compared to replaced_accs as
    # the full string -- confirm this order is intended.
    if record.name not in replaced_accs:
        for pattern, replacement in replaced_genes.items():
            label = re.sub(pattern, replacement, label)

    if 'sp|' in record.name or 'tr|' in record.name or 'gb|' in record.name:
        record.name = record.name.split('|')[1]

    record.id = f"SARG|{label.replace('@', '|')}|{record.name}"
    # Drop everything from the first ' >' onwards in the description.
    record.description = record.description.split(' >')[0]

    label_parts = label.split('@')
    if label_parts[-1] not in discarded and label_parts[-2] not in discarded and gene not in discarded:
        records_renamed.append(record)

In [7]:
# Write the renamed records to misc/init.fa in FASTA format.
with open('misc/init.fa', 'w') as init_handle:
    SeqIO.write(records_renamed, init_handle, 'fasta')

In [8]:
%%bash
# Sort misc/init.fa by sequence (-s). The sorted output goes to a temporary file
# that replaces the original only if seqkit succeeds, so the same path is never
# open for reading and writing at once.
seqkit sort -s misc/init.fa -o misc/init.sorted.fa --quiet \
    && mv misc/init.sorted.fa misc/init.fa