In [1]:
import re
import pandas as pd
import glob

from Bio import SeqIO
from collections import defaultdict

In [2]:
# Name of the assembly directory whose protein FASTA files are scanned.
species = 'Bacillus subtilis subsp. subtilis str. 168'

cnt = defaultdict(int)  # sequence -> number of records carrying that sequence
cpx = dict()            # sequence -> description of the last record seen with it
cre = dict()            # record id -> last record seen with that id

for faa_path in glob.glob(f'assembly/{species}/*/*.faa'):
    with open(faa_path) as faa_handle:
        for protein in SeqIO.parse(faa_handle, 'fasta'):
            # Only records whose id contains 'WP_' are kept.
            if 'WP_' not in protein.id:
                continue
            cnt[protein.seq] += 1
            cpx[protein.seq] = protein.description
            cre[protein.id] = protein

In [3]:
# Sequences already present in tmp/subject.fa; they are skipped below so only
# new candidates are reported.
dups = set()
with open('tmp/subject.fa') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        dups.add(record.seq)

# A description must match the first pattern (case-insensitive) and must not
# match the second one.
# NOTE(review): the exclusion pattern is matched case-sensitively, unlike the
# inclusion pattern - confirm this asymmetry is intended.
include_pattern = re.compile('multidrug|resistance|efflux|outer membrane subunit', re.IGNORECASE)
exclude_pattern = re.compile('repressor|regulator|accessory|activator|sensor|arsenite|arsenic|iron|copper|heat|nickel|zinc|heavy metal')

lines = []
for seq, header in cpx.items():
    if not include_pattern.search(header) or exclude_pattern.search(header):
        continue
    if seq in dups:
        continue
    dups.add(seq)

    # Guess the gene name: last word before the ' [organism]' suffix, with the
    # first letter lower-cased. Slicing (instead of indexing) keeps an empty
    # token from raising IndexError.
    gene = header.split(' [')[0].split(' ')[-1]
    gene = gene[:1].lower() + gene[1:]

    accession = header.split(' ')[0]
    description = header
    count = cnt.get(seq)
    lines.append([gene, accession, description, count])

# Keep one row per guessed gene name: the one with the highest count, ties
# broken by accession. The explicit column labels keep the pipeline working
# when nothing matched (an empty frame without columns makes sort_values
# raise KeyError).
lines = (
    pd.DataFrame(lines, columns=[0, 1, 2, 3])
    .sort_values([0, 3, 1], ascending=[True, False, True])
    .groupby(0, as_index=False)
    .first()
)
for _, line in lines.iterrows():
    print(f'"{line[1]}": "{line[0]}", // {line[2]} | cnt: {line[3]}')

# Write the selected records to tmp/protein.fa.
with open('tmp/protein.fa', 'w') as output_handle:
    SeqIO.write([cre.get(x) for x in lines[1].unique()], output_handle, 'fasta-2line')

"WP_009969035.1": "azlC", // WP_009969035.1 MULTISPECIES: azaleucine resistance protein AzlC [Bacillales] | cnt: 11
"WP_003227840.1": "chrA", // WP_003227840.1 MULTISPECIES: chromate resistance efflux protein ChrA [Bacillales] | cnt: 12
"WP_003227843.1": "chrB", // WP_003227843.1 MULTISPECIES: chromate efflux transporter subunit ChrB [Bacillales] | cnt: 12
"WP_003245812.1": "crcB", // WP_003245812.1 MULTISPECIES: fluoride efflux transporter CrcB [Bacillales] | cnt: 12
"WP_010886645.1": "peptide", // WP_010886645.1 MULTISPECIES: tetracycline resistance efflux system leader peptide [Bacillus] | cnt: 5
"WP_003226779.1": "protein", // WP_003226779.1 MULTISPECIES: toxic anion resistance protein [Bacillales] | cnt: 12
"WP_003242663.1": "srfP", // WP_003242663.1 MULTISPECIES: surfactin resistance protein SrfP [Bacillales] | cnt: 11
"WP_003233430.1": "subunit", // WP_003233430.1 MULTISPECIES: HlyD family efflux transporter periplasmic adaptor subunit [Bacillales] | cnt: 12
"WP_078081400.1": "t

In [4]:
%%bash
# Search blastrule/proteins.fasta against tmp/subject.fa with DIAMOND and
# write only the query id column to tmp/blastrule.txt, which the notebook
# later reads to skip those queries.
diamond blastp \
    --query blastrule/proteins.fasta \
    --db tmp/subject.fa \
    --max-target-seqs 1 \
    --id 90 \
    --outfmt 6 qseqid \
    > tmp/blastrule.txt

In [5]:
# Map every reference protein id to its full FASTA description line.
cpx = dict()
with open('blastrule/proteins.fasta') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        cpx[record.id] = record.description

# Query ids listed in tmp/blastrule.txt (first tab-separated column) are
# treated as already handled. The file is read as plain text so that an empty
# file (no hits) yields an empty set instead of pandas' EmptyDataError.
dups = set()
with open('tmp/blastrule.txt') as handle:
    for row in handle:
        qseqid = row.split('\t', 1)[0].strip()
        if qseqid:
            dups.add(qseqid)

# A description must match the inclusion pattern (case-insensitive) and neither
# of the two case-sensitive exclusion patterns.
include_pattern = re.compile('multidrug|resistance|efflux|outer membrane (subunit|channel)| tolC', re.IGNORECASE)
exclude_role_pattern = re.compile('repressor|regulator|accessory|activator|sensor|inhibitory')
# Raw string: the pattern contains regex escapes such as \( and \+, which are
# invalid escape sequences in a normal string literal.
exclude_topic_pattern = re.compile(r'hydroperoxide|pyoverdine|peptide|glutathione|SAV1866|HP1184|Rv0191|toxic|auxin|toxin|invertase|lysostaphin|nucleoside|acid|sugar|complement|6-N-hydroxylaminopurine|manganese|heat|endopeptidase|arsenite|arsenic|iron|serum|cadmium|chromate|nickel|copper|Cu\(\+\)|Ni\(II\)|metal|tellurite|tellurium|zinc|chlorine|cation|stress|radiation|colicin|microcin')

# Accessions that are explicitly left out of the report (built once instead of
# on every loop iteration).
skipped_accessions = {
    'gi|490073174|ref|WP_003975349.1|', # vanS
    'gi|497574828|ref|WP_009889012.1|', # vanZ
    'gi|489108589|ref|WP_003018447.1|', # silC
    'gi|446040665|ref|WP_000118520.1|', # sugE
    'gi|497384507|ref|WP_009698720.1|', # vmrA
    'gi|1129486601|ref|WP_075443138.1|', # cmeD
}

lines = []
for protein_id, header in cpx.items():
    if not include_pattern.search(header):
        continue
    if exclude_role_pattern.search(header) or exclude_topic_pattern.search(header):
        continue
    if protein_id in dups:
        continue
    dups.add(protein_id)

    # Guess the gene name: last word before the ' [organism]' suffix, with the
    # first letter lower-cased. Slicing keeps an empty token from raising
    # IndexError.
    gene = header.split(' [')[0].split(' ')[-1]
    gene = gene[:1].lower() + gene[1:]

    accession = header.split(' ')[0]
    description = header.split(' ', 1)[-1]

    if accession not in skipped_accessions:
        lines.append([gene, accession, description])

# Explicit column labels keep sort_values working when nothing remains; an
# empty frame without columns would raise KeyError here.
lines = pd.DataFrame(lines, columns=[0, 1, 2]).sort_values([0, 1], ascending=[True, True])
print('remaining:', len(lines))
for _, line in lines.iterrows():
    print(f'"{line[1]}": "{line[0]}", // {line[2]}')

remaining: 3
"gi|489509033|ref|WP_003413897.1|": "permease", // MULTISPECIES: fluoroquinolones efflux ABC transporter permease [Mycobacterium tuberculosis complex]
"gi|489995739|ref|WP_003898775.1|": "permease", // MULTISPECIES: multidrug efflux ABC transporter permease [Mycobacterium tuberculosis complex]
"gi|490003633|ref|WP_003906541.1|": "protein", // multidrug efflux ABC transporter ATP-binding protein [Mycobacterium tuberculosis]
