In [1]:
import json5
import glob
import pandas as pd
import regex as re
import shutil
import os

from Bio import SeqIO
from collections import defaultdict

In [2]:
# Scan the reference FASTA and print the id of every record whose sequence
# was already encountered earlier in the file.
dups = set()  # sequences seen so far
with open('reference/reference.fasta') as fasta_handle:
    for entry in SeqIO.parse(fasta_handle, 'fasta'):
        if entry.seq in dups:
            print(entry.id)
            continue
        dups.add(entry.seq)

In [3]:
# Execute the companion notebook a0-parse-refs.ipynb via IPython's %run magic.
# NOTE(review): presumably this produces the tmp/*.fa files read by the next
# cell — confirm against that notebook.
%run a0-parse-refs.ipynb

In [4]:
# Lookup: sequence -> header description, truncated at the first ' >'.
with open('tmp/seq2description.fa') as handle:
    seq2description = {
        rec.seq: rec.description.split(' >')[0]
        for rec in SeqIO.parse(handle, 'fasta')
    }

# Lookup: sequence -> record id found in tmp/seq2source.fa.
with open('tmp/seq2source.fa') as handle:
    seq2source = {rec.seq: rec.id for rec in SeqIO.parse(handle, 'fasta')}

# Re-label every record of tmp/seq.fa. Records whose sequence has no (or an
# empty) description are reported by id and left out of the outputs.
records = []
lines = []
with open('tmp/seq.fa') as handle:
    for rec in SeqIO.parse(handle, 'fasta'):
        desc = seq2description.get(rec.seq)
        if not desc:
            print(rec.id)
            continue

        original_id = rec.id
        # First whitespace-delimited token vs. the remainder of the description
        # (when there is no space, both are the whole description).
        desc_parts = desc.split(' ', 1)
        first_token = desc_parts[0]
        remainder = desc_parts[-1]

        # Summary text: drop anything from ' [' onwards and a 'MULTISPECIES: ' prefix.
        short_description = remainder.split(' [')[0].split('MULTISPECIES: ')[-1]

        # New id = first three '|' fields of the old id + first description token.
        rec.id = '|'.join(original_id.split('|')[:3] + [first_token])
        rec.description = remainder
        records.append(rec)

        lines.append([original_id, seq2source.get(rec.seq), short_description, rec.id])

with open('sarg_ref.fa', 'w') as output_handle:
    SeqIO.write(records, output_handle, 'fasta')

summary = pd.DataFrame(lines, columns = ['id', 'source', 'description', 'sarg'])

# Fields 1..3 of the original '|'-separated id become their own columns.
id_fields = summary['id'].str.split('|')
for position, column in enumerate(['type', 'subtype', 'accession'], start=1):
    summary[column] = id_fields.str.get(position)

cols = ['type', 'subtype', 'sarg', 'source', 'description']
summary[cols].sort_values(cols).to_csv('misc/summary.tsv', index=False, sep='\t')

SARG|multidrug|TEM-117|AAN05026.1
SARG|penicillin_beta-lactam|OXA-36|AAG24866.1
SARG|penicillin_beta-lactam|LEN-6|AAP93848.1
SARG|multidrug|SHV-21|AAF34335.1
SARG|multidrug|SHV-23|AAF34337.1
SARG|multidrug|SHV-22|AAF34336.1
SARG|multidrug|SHV-20|AAF34334.1
SARG|multidrug|TEM-192|AEQ59620.1


In [5]:
%%bash
# Cluster sarg_ref.fa with `mmseqs easy-cluster`, using 0.95 for both the
# coverage (-c) and the minimum sequence identity (--min-seq-id) thresholds.
# Results use the prefix tmp/sarg_ref (tmp/sarg_ref_cluster.tsv is read by a
# later cell); tmp/TMP is presumably the scratch directory — TODO confirm.
# Standard output is discarded.
mmseqs easy-cluster sarg_ref.fa tmp/sarg_ref tmp/TMP \
    --cov-mode 0 -c 0.95 --min-seq-id 0.95 -s 7.5 --cluster-reassign \
    --threads 48 > /dev/null

In [6]:
# Sanity check on the 95% clustering: list (left, right) subtype pairs where
# the two members of a cluster-table row carry different subtypes.
c = pd.read_table('tmp/sarg_ref_cluster.tsv', header=None)

# Second-to-last '|' field of each member id is kept as its subtype label.
for member_col, label in ((0, 'left'), (1, 'right')):
    c[label] = c[member_col].str.split('|').str.get(-2)

c['id'] = c['left'].eq(c['right'])

# Shorten both member ids to their last '|' field.
for member_col in (0, 1):
    c[member_col] = c[member_col].str.split('|').str.get(-1)

mismatched = c[~c['id']]
print(mismatched.groupby(['left',  'right'], as_index=False).size().tail(60))

Empty DataFrame
Columns: [left, right, size]
Index: []


In [7]:
# Start from an empty info/ directory so workbooks from a previous run do not
# linger next to the freshly written ones.
if os.path.isdir('info'):
    shutil.rmtree('info')
os.makedirs('info')

# Types or subtypes listed in `viewed` are excluded from the export; with the
# empty set every row of the summary is exported.
viewed = set()
summary = pd.read_table('misc/summary.tsv')
cols = ['type', 'subtype', 'source', 'sarg', 'description']
pending = summary[~(summary['type'].isin(viewed) | summary['subtype'].isin(viewed))]

# One workbook per type, written to info/<type>.xlsx.
for type_name, group in pending.groupby('type'):
    # .assign builds a new frame instead of writing into the frame yielded by
    # groupby, which avoids pandas' SettingWithCopyWarning.
    annotated = group.assign(misc='"' + group['source'] + '", // ' + group['description'])
    # File name: type truncated to 31 characters, with '/' replaced by '-'.
    workbook_path = f"info/{type_name[:31].replace('/', '-')}.xlsx"
    annotated.sort_values(cols).set_index(['type', 'subtype', 'source']).to_excel(workbook_path)

In [8]:
%%bash
# Download ReferenceGeneHierarchy.txt and ReferenceGeneCatalog.txt from the
# "latest" AMRFinderPlus database directory on the NCBI FTP site into tmp/.
# wget flags: -q quiet, -N timestamping (skip files that are not newer than
# the local copy), -P tmp sets the download directory.
wget -qN https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/latest/ReferenceGeneHierarchy.txt -P tmp
wget -qN https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/latest/ReferenceGeneCatalog.txt -P tmp

In [9]:
# a: node_id -> parent_node_id, as read from the hierarchy table. 'ALL' is
# given a None parent so that every upward walk stops there.
a = pd.read_table('tmp/ReferenceGeneHierarchy.txt').set_index('node_id').parent_node_id.to_dict()
a['ALL'] = None

# b: node_id -> ';'-joined lineage "node;parent;grandparent;...", followed
# upwards until a node has no known parent.
b = dict()
for node, parent in a.items():
    if node == 'ALL':
        continue
    lineage = [node, parent]
    ancestor = a.get(parent)
    while ancestor is not None:
        # A repeated node means the parent table is cyclic; fail loudly
        # instead of looping forever.
        if ancestor in lineage:
            raise ValueError(f"cycle in hierarchy at node {ancestor!r} (lineage of {node!r})")
        lineage.append(ancestor)
        ancestor = a.get(ancestor)
    b[node] = ';'.join(lineage)

# Catalog without VIRULENCE rows, annotated with the lineage of each row's
# hierarchy_node. .assign returns a new frame, so no column is written into a
# filtered slice (avoids SettingWithCopyWarning).
ref = pd.read_table('tmp/ReferenceGeneCatalog.txt')
ref = ref[ref['type'] != 'VIRULENCE'].assign(
    hierarchy=lambda frame: frame['hierarchy_node'].map(b)
)

# Count catalog rows whose hierarchy_node is itself the parent of some node
# (first 50 groups are displayed as the cell output).
ref[ref['hierarchy_node'].isin(a.values())].groupby(['hierarchy_node', 'hierarchy']).size().head(50)

hierarchy_node   hierarchy                                                       
aac(3)-I         aac(3)-I;aac(3)_gen;aac;AME;AMR;ALL                                  3
aac(3)-VIII      aac(3)-VIII;aac(3);aac(3)_gen;aac;AME;AMR;ALL                        3
aac(6')-29       aac(6')-29;aac(6');aac;AME;AMR;ALL                                   1
aac(6')-Ian      aac(6')-Ian;aac(6');aac;AME;AMR;ALL                                  2
aac(6')-Ib-AGKT  aac(6')-Ib-AGKT;aac(6')-Ib;aac(6')-set_A;aac(6');aac;AME;AMR;ALL     2
aac(6')-Ib-AKT   aac(6')-Ib-AKT;aac(6')-Ib;aac(6')-set_A;aac(6');aac;AME;AMR;ALL     19
aac(6')-Ib-G     aac(6')-Ib-G;aac(6')-Ib;aac(6')-set_A;aac(6');aac;AME;AMR;ALL       25
aac(6')-Ib-cr    aac(6')-Ib-cr;aac(6')-Ib;aac(6')-set_A;aac(6');aac;AME;AMR;ALL       1
aac(6')-set_A    aac(6')-set_A;aac(6');aac;AME;AMR;ALL                                7
aac(6')_Acine    aac(6')_Acine;aac(6')-I;aac(6')_E;aac(6');aac;AME;AMR;ALL            1
aac(6')_Entco    aac(6')_Entco;aac(6')

In [10]:
# summary[(summary.type == 'capuramycin')].ref.to_csv('tmp/id.txt', index=False, header=None)

In [11]:
# %%bash
# seqkit grep -f tmp/id.txt tmp/subject.fa > tmp/query.fa
# diamond blastp -q tmp/query.fa -d tmp/subject.fa \
#     --outfmt 6 qseqid sseqid pident qcovhsp scovhsp evalue bitscore \
#     -k 0 --masking 0 > tmp/hit.txt

In [12]:
# left = pd.read_table('tmp/id.txt', header=None)
# right = pd.read_table('tmp/hit.txt', header=None)
# tmp = pd.merge(left, right, how='left')
# tmp = tmp[tmp[0].str.split('|').str.get(0)!=tmp[1].str.split('|').str.get(0)].sort_values([0,6], ascending=False).groupby(0).head(1).set_index(0).sort_index()
# tmp = tmp[tmp.index.str.contains('CARD')]

# for i,j in tmp.iterrows():
#     print(f"\"{i}\", // {j[1]} | id: {j[2]} qcov: {j[3]} scov: {j[4]}")