In [1]:
import json5
from collections import defaultdict
import glob
import pandas as pd
import regex as re
from Bio import SeqIO

In [2]:
# Execute the sequence-curation notebook before anything else in this notebook.
# NOTE(review): presumably this produces the FASTA inputs read further down
# (e.g. misc/init.fasta) — confirm against a1-curate-seqs.ipynb.
%run a1-curate-seqs.ipynb

[INFO][0m read sequences ...
[INFO][0m 493 sequences loaded
[INFO][0m sorting ...
[INFO][0m output ...


unused: REF|NA|Rv1877|sp|P9WG85|Y1877_MYCTU
unused: REF|NA|vraF|WP_001637630.1
unused: REF|NA|nicT|sp|I6YEJ7|NICT_MYCTU
unused: REF|NA|protein|WP_003722374.1
unused: REF|NA|transporter|WP_000473297.1
unused: REF|NA|mmpS5|sp|P9WJS7|MMPS5_MYCTU
unused: REF|NA|SAV1866|sp|Q99T13|Y1866_STAAM
unused: REF|NA|transporter|WP_000905722.1
unused: REF|NA|Rv1273c|sp|P9WQJ1|Y1273_MYCTU
unused: REF|NA|Rv1410c|sp|P9WJY3|MFS55_MYCTU
unused: REF|NA|kac|tr|Q75PS3|Q75PS3_STRKN
unused: REF|NA|Rv2686c|sp|P9WJB3|FLQE2_MYCTU
unused: REF|NA|Rv0194|sp|O53645|MDREP_MYCTU
unused: REF|NA|Rv1218c|sp|O86311|MEATP_MYCTU
unused: REF|NA|vraG|WP_002457535.1
unused: REF|NA|vraF|WP_020363729.1
unused: REF|NA|kmr|tr|Q75PS4|Q75PS4_STRKN
unused: REF|NA|Rv1217c|sp|O05318|MEPRM_MYCTU
unused: REF|NA|Rv2688c|sp|P9WQL7|FLQE1_MYCTU
unused: REF|NA|Rv0191|sp|P9WJX7|CHLEP_MYCTU
unused: REF|NA|Rv1634|sp|P9WJX3|Y1634_MYCTU
unused: REF|NA|vraG|WP_000143623.1
unused: REF|NA|transporter|WP_003119853.1
unused: REF|NA|permease|WP_010989900.

[INFO][0m read sequences ...
[INFO][0m 7393 sequences loaded
[INFO][0m sorting ...
[INFO][0m output ...


In [3]:
%%bash
# Abort on the first failing command. Without this, a failed mmseqs run is masked
# by the trailing `rm -rf` (which exits 0), the cell reports success, and the next
# cell silently reads a stale tmp/init_cluster.tsv from an earlier run.
set -euo pipefail

mkdir -p tmp

# Cluster misc/init.fasta; results are written with the prefix tmp/init
# (tmp/init_cluster.tsv is read by the next cell) and tmp/init doubles as the
# scratch directory. Thresholds: coverage 0.95 (-c, --cov-mode 0) and sequence
# identity 0.95 (--min-seq-id); see the MMseqs2 user guide for exact semantics.
mmseqs easy-cluster misc/init.fasta tmp/init tmp/init \
    --cov-mode 0 -c 0.95 --min-seq-id 0.95 -s 7.5 --cluster-reassign \
    --threads 64 > /dev/null

# Remove the scratch directory; the tmp/init_* result files are kept.
rm -rf tmp/init

In [4]:
# Two-column cluster table written by mmseqs: column 0 and column 1 hold
# '|'-delimited sequence identifiers.
c = pd.read_table('tmp/init_cluster.tsv', header=None)

# Split each identifier once and reuse the pieces.
_id_parts = {column: c[column].str.split('|') for column in (0, 1)}

# Second-to-last field of each identifier, and whether both sides agree.
c['left'] = _id_parts[0].str.get(-2)
c['right'] = _id_parts[1].str.get(-2)
c['id'] = c['left'].eq(c['right'])

# Keep only the last field of each identifier in the original columns.
for column in (0, 1):
    c[column] = _id_parts[column].str.get(-1)

# Count the (left, right) combinations that disagree within a cluster.
_mismatched = c[~c['id']]
print(_mismatched.groupby(['left',  'right'], as_index=False).size().tail(60))

Empty DataFrame
Columns: [left, right, size]
Index: []


In [5]:
# Rows whose two identifiers disagree on the second-to-last field.
c.loc[~c['id']]

Unnamed: 0,0,1,left,right,id


In [6]:
# Collect curation decisions from sarg.json:
#   * `discarded` - identifiers to drop, gathered from the top-level 'discarded'
#     entry and from the 'discarded' entry of every other top-level section;
#   * `added`     - maps each key of a section's 'added' mapping to the label
#     'REF|<section>|<value>|<key>'.
added = {}
discarded = set()
with open('sarg.json', 'r') as f:
    _sarg_config = json5.load(f)

for _section, _entry in _sarg_config.items():
    if _section == 'discarded':
        # Top-level list of identifiers to drop.
        discarded.update(_entry)
        continue
    discarded.update(_entry.get('discarded', ()))
    for _key, _value in _entry.get('added', {}).items():
        added[_key] = '|'.join(('REF', _section, _value, _key))

def _nr_title(description):
    """Reduce a FASTA description line to a bare title.

    Keeps the text before the first ' >', drops the leading identifier token,
    cuts everything from the first ' [' onwards and removes any
    'MULTISPECIES: ' prefix.
    """
    first_entry = description.partition(' >')[0]
    without_id = first_entry.split(' ', 1)[1]
    without_bracket = without_id.partition(' [')[0]
    return without_bracket.rpartition('MULTISPECIES: ')[2]


# Sequence -> title. Entries from sarg2nr.fa always win; sarg2env_nr.fa only
# fills in sequences that are not present yet.
bxy = dict()
with open('misc/sarg2nr.fa') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        bxy[record.seq] = _nr_title(record.description)
with open('misc/sarg2env_nr.fa') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        if record.seq in bxy:
            continue
        bxy[record.seq] = _nr_title(record.description)

# Sequence -> reference identifier. The first occurrence of a sequence wins and
# records whose id is listed in `discarded` are ignored. Ids from sequence.fasta
# are stored unchanged; ids from reference.fasta are translated through `added`
# (giving None when the id has no entry there).
seq = dict()
dups = set()
_reference_sources = (
    ('reference/sequence.fasta', lambda record_id: record_id),
    ('reference/reference.fasta', added.get),
)
for _path, _resolve_id in _reference_sources:
    with open(_path) as f:
        for record in SeqIO.parse(f, 'fasta'):
            if record.id in discarded or record.seq in dups:
                continue
            dups.add(record.seq)
            seq[record.seq] = _resolve_id(record.id)

# Walk misc/init.fasta once: gather one metadata row per record and relabel the
# record with the reference identifier looked up by sequence.
rec = []
records = []
with open('misc/init.fasta') as f:
    for record in SeqIO.parse(f, 'fasta'):
        _reference_id = seq.get(record.seq)
        # Text inside the last '[...]' of the description line.
        _bracket_text = record.description.rpartition('[')[2].partition(']')[0]
        rec.append([record.id, _reference_id, bxy.get(record.seq), _bracket_text])
        record.id = _reference_id
        records.append(record)

# Relabelled sequences, later used as the search subject.
with open('tmp/subject.fa', 'w') as output_handle:
    SeqIO.write(records, output_handle, 'fasta')

# Turn the collected rows into a frame and expose fields 1-3 of the
# '|'-delimited SARG identifier as separate columns.
rec = pd.DataFrame(rec, columns = ['SARG', 'ref', 'desc', 'spec'])
_sarg_fields = rec['SARG'].str.split('|')
for _position, _column in enumerate(('type', 'subtype', 'accession'), start=1):
    rec[_column] = _sarg_fields.str.get(_position)

# Columns exported (and used as the sort key) for the summary table.
cols = ['type', 'subtype', 'ref', 'desc', 'spec']
_summary = rec[cols].sort_values(cols)
_summary.to_csv('misc/sarg.tsv', index=False, sep='\t')

In [7]:
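# %%bash
# wget https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/latest/ReferenceGeneHierarchy.txt -P tmp -q
# NOTE: one-off download, kept commented out. Uncomment both lines above to (re)fetch
# tmp/ReferenceGeneHierarchy.txt, which the next cell reads.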

In [8]:
# a: node_id -> parent_node_id, straight from the hierarchy table.
a = pd.read_table('tmp/ReferenceGeneHierarchy.txt').set_index('node_id').parent_node_id.to_dict()
a['ALL'] = None  # the root has no parent

# b: node_id -> 'node;parent;grandparent;...' lineage string, ending at the last
# ancestor that has no (known) parent. Parent pointers are followed directly
# instead of re-parsing the growing ';'-joined string, so node ids are never
# split on ';'. A cyclic hierarchy raises instead of looping forever, and a
# missing (NaN) parent simply ends the lineage instead of raising TypeError.
b = dict()
for _node, _parent in a.items():
    if _node == 'ALL':
        continue
    _lineage = [_node]
    _seen = {_node}
    while pd.notna(_parent):
        if _parent in _seen:
            raise ValueError(
                f"cycle in gene hierarchy: {_parent!r} reached again while resolving {_node!r}"
            )
        _lineage.append(_parent)
        _seen.add(_parent)
        _parent = a.get(_parent)
    b[_node] = ';'.join(_lineage)

# Reference gene catalogue without virulence entries. The explicit .copy() makes
# the filtered frame independent, so adding the 'hierarchy' column below does not
# assign into a slice of the original frame (SettingWithCopyWarning).
ref = pd.read_table('reference/refgenes.tsv')
ref = ref[ref.Type != 'VIRULENCE'].copy()

# Hierarchy nodes referenced only by the optional filter that is commented out below.
nset = {
    "aac(6')-29",
    "aac(6')-set_A",
    "aac(6')_Serra",
    "aac(6')_Steno",
    "aac(6')_Strep",
}
# rec = rec[rec.accession.isin(set(ref[ref['#Hierarchy node ID'].isin(nset)]['RefSeq protein'].to_list()))]

# Attach the ';'-joined lineage built in `b` to every catalogue row.
ref['hierarchy'] = ref['#Hierarchy node ID'].map(b)

# Catalogue entries whose node is itself the parent of another node, with row counts.
ref[ref['#Hierarchy node ID'].isin(a.values())].groupby(['#Hierarchy node ID', 'hierarchy']).size().head(50)

#Hierarchy node ID  hierarchy                                                       
aac(3)-I            aac(3)-I;aac(3)_gen;aac;AME;AMR;ALL                                  3
aac(3)-VIII         aac(3)-VIII;aac(3);aac(3)_gen;aac;AME;AMR;ALL                        3
aac(6')-29          aac(6')-29;aac(6');aac;AME;AMR;ALL                                   1
aac(6')-Ian         aac(6')-Ian;aac(6');aac;AME;AMR;ALL                                  2
aac(6')-Ib-AGKT     aac(6')-Ib-AGKT;aac(6')-Ib;aac(6')-set_A;aac(6');aac;AME;AMR;ALL     2
aac(6')-Ib-AKT      aac(6')-Ib-AKT;aac(6')-Ib;aac(6')-set_A;aac(6');aac;AME;AMR;ALL     19
aac(6')-Ib-G        aac(6')-Ib-G;aac(6')-Ib;aac(6')-set_A;aac(6');aac;AME;AMR;ALL       25
aac(6')-Ib-cr       aac(6')-Ib-cr;aac(6')-Ib;aac(6')-set_A;aac(6');aac;AME;AMR;ALL       1
aac(6')-set_A       aac(6')-set_A;aac(6');aac;AME;AMR;ALL                                7
aac(6')_Acine       aac(6')_Acine;aac(6')-I;aac(6')_E;aac(6');aac;AME;AMR;ALL            1
aac(6

In [9]:
# Names already reviewed, read from misc/viewed.txt: space-separated tokens per
# line; lines starting with '#' and empty lines are skipped.
viewed = set()
with open('misc/viewed.txt') as f:
    for line in f:
        if line.startswith('#') or line == '\n':
            continue
        viewed.update(line.rstrip().split(' '))

In [10]:
%%bash
# Start from an empty info/ directory: delete any previous contents, then recreate it.
rm -rf info
mkdir -p info

In [11]:
# Rows whose type and subtype have both not been reviewed yet. A single .loc
# selection replaces the chained rec[cols][mask] lookup.
_unviewed = ~(rec['type'].isin(viewed) | rec['subtype'].isin(viewed))

# One workbook per type. .assign() returns a new frame, so the helper column is
# not written into the groupby slice (which triggers SettingWithCopyWarning).
for row, group in rec.loc[_unviewed, cols].groupby('type'):
    group = group.assign(misc='"' + group['ref'] + '", // ' + group['desc'])
    # File name: type truncated to 31 characters, '/' replaced so it stays a single path component.
    group.sort_values(cols).set_index(['type', 'subtype', 'ref']).to_excel(f"info/{row[:31].replace('/', '-')}.xlsx")

# Reference ids of aminoglycoside entries whose reference id mentions CARD; used
# as the pattern file for the sequence extraction in the next cell.
# na=False states explicitly that rows without a reference id are excluded, and
# header=False is the documented way to omit the header line.
_is_card_aminoglycoside = (rec['type'] == 'aminoglycoside') & rec['ref'].str.contains('CARD', na=False)
rec.loc[_is_card_aminoglycoside, 'ref'].to_csv('tmp/id.txt', index=False, header=False)

In [12]:
%%bash
# Abort on the first failing command, so diamond never runs against a missing or
# partially written query file when the seqkit step fails.
set -euo pipefail

# Pull the sequences whose ids are listed in tmp/id.txt out of the reference set.
seqkit grep -f tmp/id.txt reference/sequence.fasta > tmp/query.fa

# Search the queries against tmp/subject.fa. The tabular columns are, in order:
# qseqid sseqid pident qcovhsp scovhsp evalue bitscore (the next cell addresses
# them by position). -k 0 lifts the cap on reported targets; self hits are dropped.
diamond blastp -q tmp/query.fa -d tmp/subject.fa --outfmt 6 qseqid sseqid pident qcovhsp scovhsp evalue bitscore \
    -k 0 --no-self-hits > tmp/hit.txt

[INFO][0m 2 patterns loaded from file


In [13]:
# Query ids (column 0) joined with their diamond hits; columns 1-6 of the hit
# table are sseqid, pident, qcovhsp, scovhsp, evalue, bitscore.
left = pd.read_table('tmp/id.txt', header=None)
right = pd.read_table('tmp/hit.txt', header=None)
_hits = pd.merge(left, right, how='left')

# Keep hits whose first '|' field differs between query and subject, then take
# the highest-bitscore row per query.
_query_prefix = _hits[0].str.split('|').str.get(0)
_subject_prefix = _hits[1].str.split('|').str.get(0)
tmp = (
    _hits[_query_prefix != _subject_prefix]
    .sort_values([0, 6], ascending=False)
    .groupby(0)
    .first()
)
# tmp = tmp[tmp[2] > 90]

# One line per query, formatted as a quoted id followed by a trailing comment.
for _query, _best in tmp.iterrows():
    print(f"\"{_query}\", // {_best[1]} | id: {_best[2]} qcov: {_best[3]} scov: {_best[4]}")

"CARD|homolog|aminoglycoside|AAC(6')-34|APB03223.1", // NCBI|amr:amr|aminoglycoside|aac(6')-35|WP_087349651.1 | id: 53.9 qcov: 90.9 scov: 98.3
"CARD|homolog|aminoglycoside|APH(4)-Ib|CAA52372.1", // NCBI|amr:amr|aminoglycoside|aph(4)-Ia|WP_000742814.1 | id: 51.3 qcov: 54.3 scov: 65.7
