In [1]:
import pandas as pd
import regex as re
import glob
from Bio import SeqIO

In [2]:
# Accumulates every sequence record selected for the combined output FASTA.
records = []

# NCBI gene table; empty cells are replaced by the literal string 'NA'.
ncbi = pd.read_table('reference/refgenes.tsv').fillna('NA')

# Build the header prefix for each row:
#   NCBI|<type>:<subtype>|<class>|<gene family>
# Type, subtype and class are lower-cased; spaces in class and gene family
# become underscores.
type_subtype = (ncbi['Type'] + ':' + ncbi['Subtype']).str.lower()
class_label = ncbi['Class'].str.lower().str.replace(' ', '_')
family_label = ncbi['Gene family'].str.replace(' ', '_')
ncbi['id'] = 'NCBI|' + type_subtype + '|' + class_label + '|' + family_label

# Map protein accession -> header prefix. The RefSeq accession is preferred;
# the GenBank accession is used only when RefSeq is 'NA'. Rows where both are
# 'NA' contribute nothing, and later rows overwrite earlier ones on collision.
ncbi2id = {}
for refseq_acc, genbank_acc, label in zip(
    ncbi['RefSeq protein'], ncbi['GenBank protein'], ncbi['id']
):
    accession = refseq_acc if refseq_acc != 'NA' else genbank_acc
    if accession != 'NA':
        ncbi2id[accession] = label

# Only records whose new id contains one of these markers are kept.
kept_markers = ('|amr:amr|', '|stress:biocide|')

dups = set()
with open('reference/proteins.faa') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        # Sequences without an entry in the gene table are ignored.
        if record.id not in ncbi2id:
            continue

        record.id = ncbi2id[record.id] + '|' + record.id

        # Report repeated ids (whatever their category) and keep the first.
        if record.id in dups:
            print(f'dups: {record.id}')
            continue
        dups.add(record.id)

        if any(marker in record.id for marker in kept_markers):
            records.append(record)


# CARD ARO index; empty cells are replaced by the literal string 'NA'.
card = pd.read_table('reference/aro_index.tsv').fillna('NA')
# Label per ARO entry: <drug class>|<short name>, with the drug class
# lower-cased, the substring ' antibiotic' removed, and spaces -> underscores.
card['id'] = (card['Drug Class'].str.lower().str.replace(' antibiotic', '') + '|' + card['CARD Short Name']).str.replace(' ', '_')
card2id = card.set_index('ARO Accession').id.to_dict()

# Compiled once; the pattern is applied to every FASTA header below.
aro_pattern = re.compile(r'ARO:[0-9]+')

dups = set()
# glob returns paths in arbitrary order. Sorting makes both the order of the
# collected records and the choice of which duplicate is kept reproducible.
for file in sorted(glob.glob('reference/protein*.fasta')):
    # Model type taken from the file name: the text after 'fasta_' up to the
    # first '.', minus 'protein_' and '_model' (a name ending in
    # 'fasta_protein_homolog_model.fasta' would give 'homolog').
    # It is constant for the whole file, so compute it once.
    model_type = file.split('fasta_')[1].split('.')[0].replace('protein_', '').replace('_model', '')

    with open(file) as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            match = aro_pattern.search(record.description)
            if match is None:
                # Fail with context instead of an AttributeError on None.
                raise ValueError(
                    f'{file}: no ARO accession found in header {record.description!r}'
                )
            ARO = match.group()

            if ARO not in card2id:
                # Fail with context instead of a TypeError on str + None.
                raise ValueError(
                    f'{file}: {ARO} (header {record.description!r}) is not listed '
                    f'in reference/aro_index.tsv'
                )

            # New id: CARD|<model type>|<drug class>|<short name>|<third-from-last
            # '|'-separated field of the original id>.
            record.id = 'CARD|' + model_type + '|' + card2id[ARO] + '|' + record.id.split('|')[-3]

            # Report repeated ids and keep only the first occurrence.
            if record.id in dups:
                print(record.id)
                continue
            dups.add(record.id)

            # Only homolog-model sequences go into the output.
            if '|homolog|' in record.id:
                records.append(record)

# Write every collected record to a single FASTA file, replacing any
# existing file at this path.
with open('reference/sequence.fasta', 'w') as fasta_out:
    SeqIO.write(records, fasta_out, 'fasta')

dups: NCBI|amr:point|pleuromutilin|eat(A)|WP_002296175.1
dups: NCBI|amr:point|phenicol|fexA|WP_015585966.1
dups: NCBI|amr:point|glycopeptide|vanR-Cd|WP_003436401.1
dups: NCBI|amr:point|glycopeptide|vanS-Cd|WP_011861275.1
