In [None]:
%%bash
# Concatenate sarg_ext.fasta and sarg_ref.fasta into one FASTA stream and
# shuffle the record order. -s0 sets seqkit's random seed to 0, so the
# shuffled order is the same on every run.
cat sarg_ext.fasta sarg_ref.fasta | seqkit shuffle -s0 > sarg_full.fasta

In [None]:
from Bio import SeqIO
# Partition the pooled sequences by whether the record ID contains '*':
# matching records are collected in `a`, all remaining records in `b`.
a, b = [], []
with open('sarg_full.fasta') as fasta_in:
    for rec in SeqIO.parse(fasta_in, 'fasta'):
        bucket = a if '*' in rec.id else b
        bucket.append(rec)

# Write each partition to its own FASTA file.
for out_path, subset in (('sarg_full_a.fasta', a), ('sarg_full_b.fasta', b)):
    with open(out_path, 'w') as fasta_out:
        SeqIO.write(subset, fasta_out, 'fasta')

In [None]:
# %%bash
# diamond blastp -q sarg_full_a.fasta \
#     -d sarg_full_b.fasta --out self_alignment/cross.txt \
#     --outfmt 6 qseqid sseqid nident qlen slen pident qcovhsp scovhsp bitscore evalue \
#     --id 90 --subject-cover 90 --query-cover 90 \
#     -k 0 --threads 48 --no-self-hits --masking 0
#
# Disabled cell, kept for reference: DIAMOND blastp of the '*'-tagged
# sequences (sarg_full_a.fasta) against the remaining ones (sarg_full_b.fasta)
# at >= 90 % identity and >= 90 % query/subject coverage.
# Its output, self_alignment/cross.txt, is read by a later cell, so this
# command must have been run at least once before the notebook is executed
# end to end.
# NOTE(review): nothing in this cell creates self_alignment/ -- confirm the
# directory exists before re-enabling.

In [None]:
# Disabled cell, kept for reference: split sarg_full.fasta into 10 parts and
# align each part against the full set with DIAMOND blastp (>= 95 % identity
# and >= 95 % query/subject coverage, self hits excluded), writing one table
# per part to self_alignment/<part name>.txt.
# Presumably these are the tables matched by the 'self_alignment/sarg*.txt'
# glob in the following cell -- verify the part file names.
# NOTE(review): these are shell commands but the cell has no `%%bash` line;
# one is needed when re-enabling.
# seqkit split2 -p 10 --force sarg_full.fasta 

# mkdir -p self_alignment
# for file in sarg_full.fasta.split/*.fasta
# do
#     filename=${file%.fasta}
#     filename=${filename##*/}
#     diamond blastp -q $file \
#         -d sarg_full.fasta --out self_alignment/$filename.txt \
#         --outfmt 6 qseqid sseqid nident qlen slen pident qcovhsp scovhsp bitscore evalue \
#         --id 95 --subject-cover 95 --query-cover 95 \
#         -k 0 --threads 48 --no-self-hits --masking 0
# done

In [None]:
import pandas as pd
from tqdm import tqdm
from Bio import SeqIO
import glob
# Collect every alignment row whose query (column 0) and subject (column 1)
# differ in the third '|'-separated field of their sequence IDs.
r = []

for table_path in tqdm(glob.glob('self_alignment/sarg*.txt')):
    with open(table_path) as table:
        for row in table:
            fields = row.rstrip().split('\t')
            query_label = fields[0].split('|')[2]
            subject_label = fields[1].split('|')[2]
            if query_label != subject_label:
                r.append(fields)

In [None]:
# Tabulate the collected hits. Columns keep their integer positions; 'left'
# and 'right' hold the third '|'-separated field of columns 0 and 1.
rr = pd.DataFrame(r)
for label, id_col in (('left', 0), ('right', 1)):
    rr[label] = rr[id_col].str.split('|').str.get(2)

In [None]:
# Gene labels whose sequences are dropped whenever they take part in a
# cross-gene hit.
ngene = {'mdtI', 'mdtJ', 'tolC', 'mexC', 'blaSHV', 'blaPLA', 'blaMOX', 'blaACT', 'bacA', 'fosB', 'vexH'}

# A hit side is flagged when its gene label is in `ngene` or contains a literal
# asterisk. regex=False matches the '*' literally; the previous pattern '\*'
# relied on an invalid string escape sequence, which Python warns about.
left_flagged = rr['left'].isin(ngene) | rr['left'].str.contains('*', regex=False)
right_flagged = rr['right'].isin(ngene) | rr['right'].str.contains('*', regex=False)

# IDs to exclude: flagged queries (column 0) plus flagged subjects (column 1).
nset = set(rr.loc[left_flagged, 0]) | set(rr.loc[right_flagged, 1])

In [None]:
# Drop every hit in which either side is already scheduled for exclusion, then
# show how many hits remain for each (left, right) label pair.
touches_nset = rr[0].isin(nset) | rr[1].isin(nset)
rrr = rr[~touches_nset]
rrr.groupby(['left', 'right']).size()

In [None]:
# All sequence IDs that still appear on either side of a remaining hit.
# Built once up front: it does not depend on the gene being counted.
remaining_ids = set(rrr[0]) | set(rrr[1])

for gene in sorted(rrr['left'].unique()):
    # Substring test against the whole ID, so a label that is contained in
    # another label (or elsewhere in the ID) is counted as well.
    print(gene, sum(gene in seq_id for seq_id in remaining_ids))

In [None]:
# Also exclude every ID found in column 0 of the cross-alignment table.
cross_hits = pd.read_table('self_alignment/cross.txt', header=None)
nset |= set(cross_hits[0])

In [None]:
def _write_filtered_fasta(src, dst, excluded):
    """Copy the FASTA records of `src` to `dst`, skipping IDs in `excluded`.

    Returns a tuple ``(kept, labels)``: `kept` is the list of records written,
    and `labels` holds, for each kept record, fields 1 and 2 of its
    '|'-separated ID (as a list).
    """
    with open(src) as handle:
        kept = [rec for rec in SeqIO.parse(handle, 'fasta') if rec.id not in excluded]
    with open(dst, 'w') as output_handle:
        SeqIO.write(kept, output_handle, 'fasta')
    return kept, [rec.id.split('|')[1:3] for rec in kept]


def _write_label_counts(labels, path):
    """Write a headerless TSV with the number of records per label pair."""
    # header=False is the documented way to suppress the header row.
    pd.DataFrame(labels).groupby([0, 1]).size().to_csv(path, sep='\t', header=False)


# Same filter for both inputs; `records` ends up holding the kept sarg_ext
# records, as it did before this cell was refactored.
records, id_ref = _write_filtered_fasta('sarg_ref.fasta', 'sarg_ref_clean.fasta', nset)
records, id_ext = _write_filtered_fasta('sarg_ext.fasta', 'sarg_ext_clean.fasta', nset)

_write_label_counts(id_ref, 'sarg_ref.txt')
_write_label_counts(id_ext, 'sarg_ext.txt')

In [None]:
%%bash
# Cluster each cleaned set with MMseqs2 easy-cluster. The second argument is
# the output prefix; later cells read <prefix>_cluster.tsv and
# <prefix>_rep_seq.fasta. TMP is the scratch directory.
# sarg_ref: coverage and sequence-identity thresholds of 0.995.
mmseqs easy-cluster sarg_ref_clean.fasta sarg_ref TMP \
    --cov-mode 0 -c 0.995 --min-seq-id 0.995 -s 7.5 --cluster-reassign
# sarg_ext: looser thresholds of 0.95.
mmseqs easy-cluster sarg_ext_clean.fasta sarg_ext TMP \
    --cov-mode 0 -c 0.95 --min-seq-id 0.95 -s 7.5 --cluster-reassign
# Remove the scratch directory once both runs are done.
rm -rf TMP

In [None]:
# Stack the two cluster tables (column 0 and column 1 hold the two sequence
# IDs of each row).
cluster_tables = [
    pd.read_table(tsv_path, header=None)
    for tsv_path in ('sarg_ref_cluster.tsv', 'sarg_ext_cluster.tsv')
]
c = pd.concat(cluster_tables)

# Second-to-last '|' field of each ID becomes the label for that side.
for side, id_col in (('left', 0), ('right', 1)):
    c[side] = c[id_col].str.split('|').str.get(-2)

# True where both sides of the row carry the same label.
c['id'] = c['left'] == c['right']

# Shorten both ID columns to their last '|' field.
for id_col in (0, 1):
    c[id_col] = c[id_col].str.split('|').str.get(-1)

# Show the label pairs of rows whose two sides disagree (last 60 pairs).
label_mismatch = c[~c['id']]
print(label_mismatch.groupby(['left', 'right'], as_index=False).size().tail(60))

In [None]:
%%bash
# Merge the representative sequences of both clusterings and sort them by
# sequence (seqkit sort -s), writing the result to sarg_cluster.fasta.
# Mind the spacing below: `-s ->` is three tokens -- the `-s` flag, `-`
# (read from stdin) and `>` (redirect stdout) -- not an arrow.
cat sarg_ref_rep_seq.fasta sarg_ext_rep_seq.fasta | seqkit sort -s -> sarg_cluster.fasta 

In [None]:
def _short_description(description):
    """Reduce a FASTA description to its core text.

    Drops everything up to the first space, then any text up to and including
    'MULTISPECIES: ', then everything from the first ' [' onwards.
    """
    text = description.split(' ', 1)[-1]
    text = text.split('MULTISPECIES: ')[-1]
    return text.split(' [')[0]


# One row per clustered sequence: ID fields 1 and 2, then the short description.
with open('sarg_cluster.fasta') as fasta_in:
    a = [
        rec.id.split('|', 3)[1:3] + [_short_description(rec.description)]
        for rec in SeqIO.parse(fasta_in, 'fasta')
    ]

# Number of sequences per (field 1, field 2) pair, written as a headerless TSV.
pair_counts = pd.DataFrame(a).groupby([0, 1]).size()
pair_counts.to_csv('sarg.txt', sep='\t', header=None)

In [None]:
zz = pd.DataFrame(a)

# Distinct (column 1, column 2) pairs among the rows typed 'multidrug'.
is_multidrug = zz[0] == 'multidrug'
mg = zz.loc[is_multidrug, [1, 2]].drop_duplicates()


def _genes_described_as(keyword):
    """Column-1 values of `mg` whose column-2 text contains `keyword`."""
    return set(mg.loc[mg[2].str.contains(keyword), 1])


# Keyword-based family sets; three extra genes are added to MFS by hand.
MFS = _genes_described_as('MFS') | {'abcA', 'jefA', 'stp'}
ABC = _genes_described_as('ABC')
SMR = _genes_described_as('SMR')
MATE = _genes_described_as('MATE')

# Everything not claimed by one of the four sets above is treated as RND.
already_assigned = MFS | ABC | SMR | MATE
mgg = mg[~mg[1].isin(already_assigned)]
# Finer-grained split kept for reference (not in use):
# MFP = set(mgg[mgg[2].str.contains('periplasmic')][1]) | {'axyA', 'mdtA', 'mdtE', 'mdtN', 'adeT'}
# RND = set(mgg[mgg[2].str.contains('permease')][1]) | {'mdtO', 'mmpL5', 'mmpL7'}
# OMP = set(mgg[mgg[2].str.contains('outer')][1]) | {'pseA'}
RND = set(mgg[1])

In [None]:
# Family sets in the order they are checked; the first set containing the
# record's field 2 wins (same priority as the former if/elif chain).
family_sets = (
    ('RND', RND),
    ('MFS', MFS),
    ('ABC', ABC),
    ('SMR', SMR),
    ('MATE', MATE),
)

records = []
with open('sarg_cluster.fasta') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        # `fields` rather than `id`: binding `id` in a notebook cell shadows the
        # builtin for the rest of the kernel session.
        fields = record.id.split('|')
        family = next((name for name, genes in family_sets if fields[2] in genes), None)
        if family is not None:
            # Replace field 1 with the family-qualified multidrug label.
            fields[1] = 'multidrug@' + family
        record.id = '|'.join(fields)
        # Keep only the text after the first space of the description.
        # NOTE(review): a description without a space is left unchanged.
        record.description = record.description.split(' ', 1)[-1]
        records.append(record)

with open('sarg.fasta', 'w') as output_handle:
    SeqIO.write(records, output_handle, 'fasta')