In [1]:
%%bash
# Concatenate the reference and candidate FASTA files and shuffle the records
# into tmp/sarg_full.fa (-s0 passes seed 0 to seqkit shuffle).
# Strict mode: without pipefail a failing `cat` (e.g. a missing input file) is
# masked by seqkit's exit status and a truncated output is written silently.
set -euo pipefail
cat sarg_ref.fa tmp/sarg_tmp.fa | seqkit shuffle -s0 --quiet > tmp/sarg_full.fa

In [2]:
# NOTE: disabled DIAMOND searches, kept for reference. Their --out paths
# (tmp/hit_ref.txt and tmp/hit_full.txt) are the tab-separated hit tables that
# the filtering cell below opens, so those files must already exist when this
# cell stays commented out. When re-enabling, remove this note so that
# `%%bash` is the first line of the cell.
# %%bash
# diamond blastp \
#     -q tmp/sarg_tmp.fa \
#     -d sarg_ref.fa \
#     --out tmp/hit_ref.txt \
#     --outfmt 6 qseqid sseqid nident qlen slen pident qcovhsp scovhsp bitscore evalue \
#     --id 90 --subject-cover 95 --query-cover 95 \
#     -k 0 --threads 48 --no-self-hits --masking 0 --quiet

# diamond blastp \
#     -q tmp/sarg_full.fa \
#     -d tmp/sarg_full.fa \
#     --out tmp/hit_full.txt \
#     --outfmt 6 qseqid sseqid nident qlen slen pident qcovhsp scovhsp bitscore evalue \
#     --id 90 --subject-cover 95 --query-cover 95 \
#     -k 0 --threads 48 --no-self-hits --masking 0 --quiet

In [3]:
import pandas as pd
from tqdm import tqdm
from Bio import SeqIO
import glob

In [4]:
def _is_cross_gene_hit(fields):
    """Return True when a hit row links two different genes at >= 95 (rounded).

    `fields` is one tab-split row of a hit table. The gene name is the third
    '|'-separated field of the query ID (column 0) and subject ID (column 1);
    column 5 is the value compared against 95 (pident, going by the --outfmt
    string in the disabled DIAMOND cell).
    """
    query_gene = fields[0].split('|')[2]
    subject_gene = fields[1].split('|')[2]
    return query_gene != subject_gene and round(float(fields[5])) >= 95


def _hit_rows(path):
    """Yield each line of a tab-separated hit table as a list of fields."""
    with open(path) as table:
        for raw in table:
            yield raw.rstrip().split('\t')


# IDs of every record in the reference FASTA.
with open('sarg_ref.fa') as handle:
    aset = {rec.id for rec in SeqIO.parse(handle, 'fasta')}

# Query IDs that hit a reference sequence carrying a different gene name.
qset = {
    fields[0]
    for fields in _hit_rows('tmp/hit_ref.txt')
    if _is_cross_gene_hit(fields)
}

# Cross-gene hits from the all-vs-all table in which neither side is a
# reference sequence nor one of the queries already flagged above.
_already_handled = aset | qset
r = [
    fields
    for fields in _hit_rows('tmp/hit_full.txt')
    if _is_cross_gene_hit(fields)
    and fields[0] not in _already_handled
    and fields[1] not in _already_handled
]

In [5]:
# Tabulate the remaining cross-gene hits (one row per hit, integer column
# labels as produced from the tab-split lists).
rr = pd.DataFrame(r)
if rr.empty:
    # With no hits left, pd.DataFrame([]) has no columns and rr[0] would raise
    # KeyError. Provide the two ID columns so the cells below run cleanly on
    # the "nothing to resolve" case.
    rr = pd.DataFrame(columns=[0, 1], dtype=object)

# Gene name = third '|'-separated field of the query / subject ID.
rr['left'] = rr[0].str.split('|').str.get(2)
rr['right'] = rr[1].str.split('|').str.get(2)

In [6]:
# Gene names whose sequences are dropped whenever they take part in a
# cross-gene hit.
ngen = {"tolC", "bacA", "blaMOX", "fosB", "kpnF", "vexH"}


def _is_flagged(gene_names):
    """Boolean mask: gene name is listed in ngen or contains a literal '*'."""
    return gene_names.isin(ngen) | gene_names.str.contains('\\*')


# Query IDs flagged through their own gene name, plus subject IDs flagged
# through theirs.
_flagged_queries = set(rr.loc[_is_flagged(rr['left']), 0])
_flagged_subjects = set(rr.loc[_is_flagged(rr['right']), 1])
nset = _flagged_queries | _flagged_subjects

In [7]:
# Keep only the hits in which neither the query nor the subject ID is in nset,
# then show how many such hits remain per (left, right) gene pair.
_touches_nset = rr[0].isin(nset) | rr[1].isin(nset)
rm = rr[~_touches_nset]
rm.groupby(['left', 'right']).size()

Series([], dtype: int64)

In [8]:
# For each gene on the query side of a remaining hit, print how many distinct
# sequence IDs (query or subject) contain that gene name. Note that
# `gene in seq_id` is a substring test against the whole ID string.
_remaining_ids = set(rm[0]) | set(rm[1])
for gene in sorted(rm['left'].unique()):
    n_matching = sum(1 for seq_id in _remaining_ids if gene in seq_id)
    print(gene, n_matching)

In [9]:
# Stop here while any cross-gene hit is still unaccounted for.
assert rm.shape[0] == 0, 'Make sure all cross-mapped cases are resolved.'

# Add the IDs collected in nset to qset, the set of IDs left out when the
# cleaned FASTA is written.
qset |= nset

In [10]:
# Keep every candidate record whose ID is not in qset ...
with open('tmp/sarg_tmp.fa') as handle:
    records = [rec for rec in SeqIO.parse(handle, 'fasta') if rec.id not in qset]

# ... and write the survivors to tmp/sarg_cleaned.fa.
with open('tmp/sarg_cleaned.fa', 'w') as output_handle:
    SeqIO.write(records, output_handle, 'fasta')

In [11]:
%%bash
# Cluster the cleaned candidates with MMseqs2 (0.95 coverage and 0.95 minimum
# sequence identity), keep the representative sequences as sarg_ext.fa, and
# merge them with the reference set into tmp/sarg_clustered.fa.
# Strict mode: mmseqs output is discarded below, so without `set -e` a failed
# clustering run would go unnoticed and the later steps could pick up a stale
# or partial tmp/sarg_ext_rep_seq.fasta. pipefail does the same for `cat | seqkit`.
set -euo pipefail
mmseqs easy-cluster tmp/sarg_cleaned.fa tmp/sarg_ext tmp/TMP \
    --cov-mode 0 -c 0.95 --min-seq-id 0.95 -s 7.5 --cluster-reassign -v 0  > /dev/null 2>&1
seqkit sort -s --quiet tmp/sarg_ext_rep_seq.fasta -o sarg_ext.fa
cat sarg_ref.fa sarg_ext.fa | seqkit sort -s --quiet > tmp/sarg_clustered.fa

In [12]:
def _id_field_pairs(fasta_path):
    """Return, per record of a FASTA file, the 2nd and 3rd '|'-separated ID fields."""
    with open(fasta_path) as handle:
        return [rec.id.split('|')[1:3] for rec in SeqIO.parse(handle, 'fasta')]


# Combined (reference + extension) set: collect the ID field pair of every
# record, and alongside it a row that also carries a trimmed description.
id = []
row = []
with open('tmp/sarg_clustered.fa') as handle:
    for rec in SeqIO.parse(handle, 'fasta'):
        pair = rec.id.split('|')[1:3]
        # Trim the description: drop everything up to the first space, any
        # 'MULTISPECIES: ' prefix, and everything from the first ' [' onwards.
        label = rec.description.split(' ', 1)[-1]
        label = label.split('MULTISPECIES: ')[-1]
        label = label.split(' [')[0]
        id.append(pair)
        row.append(pair + [label])

id_ref = _id_field_pairs('sarg_ref.fa')
id_ext = _id_field_pairs('sarg_ext.fa')

# Write one headerless, tab-separated count table per sequence set
# (number of records per distinct field pair).
for pairs, out_path in (
    (id, 'misc/sarg.txt'),
    (id_ref, 'misc/sarg_ref.txt'),
    (id_ext, 'misc/sarg_ext.txt'),
):
    counts = pd.DataFrame(pairs).groupby([0, 1]).size()
    counts.to_csv(out_path, sep='\t', header=None)

In [13]:
# Columns of `row`: 0 and 1 are the 2nd/3rd '|'-separated ID fields, 2 is the
# trimmed description.
row = pd.DataFrame(row)

# Distinct (gene, description) pairs among records whose field 0 is 'multidrug'.
mg = row[row[0] == 'multidrug'][[1, 2]].drop_duplicates()

# Gene-name sets per family: MFS/ABC/SMR/MATE are picked by a keyword in the
# description (plus a few gene names added by hand), RND1 by a gene-name
# pattern, and RND2 is every remaining multidrug gene.
MFS = set(mg[mg[2].str.contains('MFS')][1]) | {'abcA', 'jefA', 'stp'}
ABC = set(mg[mg[2].str.contains('ABC')][1]) | {'satA', 'satB', 'horA'}
SMR = set(mg[mg[2].str.contains('SMR')][1])
MATE = set(mg[mg[2].str.contains('MATE')][1])
RND1 = set(mg[mg[1].str.contains('acr|mdt|tolC|hasF|vpoC')][1])
RND2 = set(mg[~mg[1].isin(MFS|ABC|SMR|MATE|RND1)][1])

# Checked in order; the first family containing the gene wins (same priority
# as the former if/elif ladder).
_FAMILY_LABELS = (
    (MFS, 'multidrug@MFS'),
    (ABC, 'multidrug@ABC'),
    (SMR, 'multidrug@SMR'),
    (MATE, 'multidrug@MATE'),
    (RND1, 'multidrug@RND-1'),
    (RND2, 'multidrug@RND-2'),
)

records = []
with open('tmp/sarg_clustered.fa') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        fields = record.id.split('|')
        # The lookup uses the gene field only, so any record carrying one of
        # these gene names is relabelled, whatever its current field 1 is.
        for members, label in _FAMILY_LABELS:
            if fields[2] in members:
                fields[1] = label
                break
        record.id = '|'.join(fields)

        # Strip the leading (old) ID token from the description so the writer
        # does not emit it after the new ID. Split on any whitespace and fall
        # back to an empty description: the previous `split(' ', 1)[-1]`
        # returned the old ID itself for headers without a description, which
        # produced '>new_id old_id' for relabelled records.
        header_parts = record.description.split(None, 1)
        record.description = header_parts[1] if len(header_parts) > 1 else ''
        records.append(record)

with open('sarg.fa', 'w') as output_handle:
    SeqIO.write(records, output_handle, 'fasta')