In [1]:
import json5
import glob
import pandas as pd
import regex as re
from Bio import SeqIO
from collections import defaultdict

In [2]:
# Scan reference/reference.fasta and print the id of every record whose
# sequence has already been seen earlier in the same file.
dups = set()
with open('reference/reference.fasta') as f:
    for record in SeqIO.parse(f, 'fasta'):
        if record.seq in dups:
            # Repeated sequence: report it and move on.
            print(record.id)
            continue
        dups.add(record.seq)

In [3]:
# Execute the companion notebook a0-parse-refs.ipynb before the steps below.
# NOTE(review): presumably this is what produces misc/init.fa and the other
# intermediate files read by later cells — confirm against that notebook.
%run a0-parse-refs.ipynb

[INFO][0m read sequences ...
[INFO][0m 584 sequences loaded
[INFO][0m sorting ...
[INFO][0m output ...
[INFO][0m read sequences ...
[INFO][0m 7492 sequences loaded
[INFO][0m sorting ...
[INFO][0m output ...


In [4]:
%%bash
# Cluster misc/init.fa with MMseqs2 at >=95% sequence identity and >=95%
# coverage (--cov-mode 0, -c 0.95), with the output prefix set to tmp/init.
# The cluster table tmp/init_cluster.tsv is what the next cell reads.
#
# Abort on the first failing command: without this, a failed mmseqs run would
# be hidden by the successful cleanup below and the next cell could silently
# read a stale or missing cluster table.
set -euo pipefail
mmseqs easy-cluster misc/init.fa tmp/init tmp/init \
    --cov-mode 0 -c 0.95 --min-seq-id 0.95 -s 7.5 --cluster-reassign \
    --threads 64 > /dev/null
# tmp/init doubles as the mmseqs scratch directory; the tmp/init_* result
# files live next to it and are not affected by this removal.
rm -rf tmp/init

In [5]:
## check whether some subtypes are clustered at 95%
# Each row of the cluster table pairs two '|'-separated ids (columns 0 and 1).
# The second-to-last field of each id is kept as 'left' / 'right', and the id
# columns themselves are reduced to their last field.
c = pd.read_table('tmp/init_cluster.tsv', header=None)
for side, column in (('left', 0), ('right', 1)):
    fields = c[column].str.split('|')
    c[side] = fields.str.get(-2)
    c[column] = fields.str.get(-1)

# True when both members of the pair carry the same second-to-last field.
c['id'] = c['left'] == c['right']

# Count the pairs whose fields disagree; an empty frame means there are none.
mismatched = c[~c['id']]
print(mismatched.groupby(['left', 'right'], as_index=False).size().tail(60))

Empty DataFrame
Columns: [left, right, size]
Index: []


In [6]:
# Curation file: the top-level 'discarded' entry lists ids to drop; every other
# top-level entry may carry its own 'discarded' list and an 'added' mapping.
added = {}
discarded = set()

with open('sarg.json', 'r') as f:
    curation = json5.load(f)

for section, payload in curation.items():
    if section == 'discarded':
        discarded.update(payload)
        continue
    for key, value in payload.items():
        if key == 'discarded':
            discarded.update(value)
        elif key == 'added':
            # Build the label 'REF|<section>|<value>|<key>' for every added entry.
            for accession, subtype in value.items():
                added[accession] = '|'.join(['REF', section, subtype, accession])


# Sequence -> description / bracketed text, both parsed from the FASTA header.
seq2desc = dict()
seq2spec = dict()
with open('tmp/sarg2nr.fa') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        # Only the part of the header before the first ' >' is used.
        first_entry = record.description.split(' >')[0]
        # Drop the leading token (everything up to the first space).
        title = first_entry.split(' ', 1)[1]
        # Description: cut at the first ' [' and strip a 'MULTISPECIES: ' prefix.
        seq2desc[record.seq] = title.split(' [')[0].split('MULTISPECIES: ')[-1]
        # Text between the last '[' and the following ']'.
        seq2spec[record.seq] = first_entry.split('[')[-1].split(']')[0]

# Sequence -> record id as found in sarg_ref.fa.
seq2sarg = dict()
with open('sarg_ref.fa') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        seq2sarg[record.seq] = record.id

In [7]:
# Map each distinct, non-discarded sequence to a label. sequence.fasta is read
# first, so it wins for sequences present in both files; its records keep
# their own id, while reference.fasta records take their label from `added`
# (None when the id has no entry there).
seqs = dict()
dups = set()
for fasta_path, to_label in (
    ('reference/sequence.fasta', lambda identifier: identifier),
    ('reference/reference.fasta', added.get),
):
    with open(fasta_path) as f:
        for record in SeqIO.parse(f, 'fasta'):
            if record.seq in dups or record.id in discarded:
                continue
            dups.add(record.seq)
            seqs[record.seq] = to_label(record.id)

# Annotate every record of misc/init.fa and relabel it with its reference id.
init_rows = []
records = []
with open('misc/init.fa') as f:
    for record in SeqIO.parse(f, 'fasta'):
        sequence = record.seq
        label = seqs.get(sequence)
        init_rows.append([
            record.id,
            label,
            seq2sarg.get(sequence),
            seq2spec.get(sequence),
            seq2desc.get(sequence),
        ])
        record.id = label
        records.append(record)

with open('tmp/subject.fa', 'w') as output_handle:
    SeqIO.write(records, output_handle, 'fasta')

rows = pd.DataFrame(init_rows, columns=['id', 'ref', 'sarg', 'spec', 'desc'])
# The original id is '|'-separated; fields 1, 2 and 3 become separate columns.
id_fields = rows['id'].str.split('|')
for position, column in enumerate(['type', 'subtype', 'accession'], start=1):
    rows[column] = id_fields.str.get(position)

cols = ['type', 'subtype', 'ref', 'sarg', 'desc']
rows.loc[:, cols].sort_values(by=cols).to_csv('misc/sarg.tsv', sep='\t', index=False)

In [8]:
# Collect the space-separated tokens listed in misc/viewed.txt, ignoring
# lines that start with '#' and empty lines.
viewed = set()
with open('misc/viewed.txt') as f:
    for line in f:
        if line.startswith('#') or line == '\n':
            continue
        viewed.update(line.rstrip().split(' '))

In [9]:
%%bash
# Start from an empty info/ directory: delete whatever is there, then
# recreate it (the spreadsheets written by the next cell go into info/).
rm -rf info
mkdir -p info

In [10]:
# Write one spreadsheet per type for the rows whose type and subtype are both
# absent from `viewed`.
hidden = rows['type'].isin(viewed) | rows['subtype'].isin(viewed)
pending = rows.loc[~hidden, cols + ['spec']]
for type_name, group in pending.groupby('type'):
    # 'misc' holds the quoted ref id followed by the description after '//'
    # (presumably a line ready to paste into the curation file — confirm).
    sheet = group.assign(misc='"' + group['ref'] + '", // ' + group['desc'])
    # File name: the first 31 characters of the type, with '/' replaced by '-'.
    file_stem = type_name[:31].replace('/', '-')
    (
        sheet.sort_values(cols)
        .set_index(['type', 'subtype', 'ref', 'spec'])
        .to_excel(f"info/{file_stem}.xlsx")
    )

In [11]:
%%bash
# Fetch the latest AMRFinderPlus ReferenceGeneHierarchy.txt into tmp/ (needs
# network access).
#   -P tmp  save the file under tmp/
#   -q      quiet, no progress output
#   -N      timestamping: skip the download unless the remote file is newer
wget https://ftp.ncbi.nlm.nih.gov/pathogen/Antimicrobial_resistance/AMRFinderPlus/database/latest/ReferenceGeneHierarchy.txt -P tmp -qN

In [12]:
# a: node_id -> parent_node_id, as read from the hierarchy table.
a = pd.read_table('tmp/ReferenceGeneHierarchy.txt').set_index('node_id').parent_node_id.to_dict()
# 'ALL' is forced to have no parent, which makes it the end of every chain.
a['ALL'] = None

# b: node_id -> ';'-joined lineage, starting at the node itself and following
# parent links until a node without a (known) parent is reached.
b = dict()
for node in a:
    if node == 'ALL':
        continue
    lineage = [node]
    parent = a[node]
    # Walk the parent pointers directly instead of re-splitting the joined
    # string at every step. A parent of None or NaN ends the chain.
    while not pd.isna(parent):
        if parent in lineage:
            # A cyclic table would otherwise loop forever.
            raise ValueError(f"cycle in gene hierarchy at {parent!r}: {';'.join(lineage)}")
        lineage.append(parent)
        parent = a.get(parent)
    b[node] = ';'.join(lineage)

ref = pd.read_table('reference/refgenes.tsv')
# Take an explicit copy of the filtered rows so the column added below is
# written to an independent frame rather than to a slice of the full table.
ref = ref[ref['Type'] != 'VIRULENCE'].copy()
ref['hierarchy'] = ref['#Hierarchy node ID'].map(b)

# Reference genes assigned to a node that is itself the parent of another node,
# counted per node and lineage (first 50 groups are displayed).
ref[ref['#Hierarchy node ID'].isin(a.values())].groupby(['#Hierarchy node ID', 'hierarchy']).size().head(50)

#Hierarchy node ID  hierarchy                                                       
aac(3)-I            aac(3)-I;aac(3)_gen;aac;AME;AMR;ALL                                  3
aac(3)-VIII         aac(3)-VIII;aac(3);aac(3)_gen;aac;AME;AMR;ALL                        3
aac(6')-29          aac(6')-29;aac(6');aac;AME;AMR;ALL                                   1
aac(6')-Ian         aac(6')-Ian;aac(6');aac;AME;AMR;ALL                                  2
aac(6')-Ib-AGKT     aac(6')-Ib-AGKT;aac(6')-Ib;aac(6')-set_A;aac(6');aac;AME;AMR;ALL     2
aac(6')-Ib-AKT      aac(6')-Ib-AKT;aac(6')-Ib;aac(6')-set_A;aac(6');aac;AME;AMR;ALL     19
aac(6')-Ib-G        aac(6')-Ib-G;aac(6')-Ib;aac(6')-set_A;aac(6');aac;AME;AMR;ALL       25
aac(6')-Ib-cr       aac(6')-Ib-cr;aac(6')-Ib;aac(6')-set_A;aac(6');aac;AME;AMR;ALL       1
aac(6')-set_A       aac(6')-set_A;aac(6');aac;AME;AMR;ALL                                7
aac(6')_Acine       aac(6')_Acine;aac(6')-I;aac(6')_E;aac(6');aac;AME;AMR;ALL            1
aac(6

In [13]:
# rows[(rows.type == 'capuramycin')].ref.to_csv('tmp/id.txt', index=False, header=None)

In [14]:
# %%bash
# seqkit grep -f tmp/id.txt tmp/subject.fa > tmp/query.fa
# diamond blastp -q tmp/query.fa -d tmp/subject.fa --outfmt 6 qseqid sseqid pident qcovhsp scovhsp evalue bitscore \
#     -k 0 --masking 0 > tmp/hit.txt

In [15]:
# left = pd.read_table('tmp/id.txt', header=None)
# right = pd.read_table('tmp/hit.txt', header=None)
# tmp = pd.merge(left, right, how='left')
# tmp = tmp[tmp[0].str.split('|').str.get(0)!=tmp[1].str.split('|').str.get(0)].sort_values([0,6], ascending=False).groupby(0).head(1).set_index(0).sort_index()
# tmp = tmp[tmp.index.str.contains('CARD')]

# for i,j in tmp.iterrows():
#     print(f"\"{i}\", // {j[1]} | id: {j[2]} qcov: {j[3]} scov: {j[4]}")