In [1]:
# %%bash
# diamond blastp \
#     -q sarg_ref.fa \
#     -d tmp/refseq_protein.dmnd \
#     --out tmp/refseq_protein.txt \
#     --outfmt 6 qseqid sseqid stitle nident qlen slen pident qcovhsp scovhsp bitscore evalue \
#     --id 50 --subject-cover 75 --query-cover 75 \
#     -k 0 --threads 48 --quiet

In [2]:
import pandas as pd
import time
import glob

from collections import defaultdict
from tqdm import tqdm
from Bio import SeqIO, Entrez
Entrez.email = 'A.N.Other@example.com'

def _protein_name(title):
    """Reduce a FASTA / DIAMOND title to its bare protein name.

    Steps, in order: keep only the first entry of a merged title (text before
    the first ' >'), drop the leading sequence identifier (text up to the first
    space), cut everything from the first ' [' onwards, and keep the text after
    the last 'MULTISPECIES: ' prefix.

    Returns '' when the title holds nothing after the identifier, instead of
    raising IndexError.
    """
    first_entry = title.split(' >')[0]
    name = first_entry.partition(' ')[2]
    return name.split(' [')[0].split('MULTISPECIES: ')[-1]


# Protein names found in tmp/sarg2nr.fa; later used for the "seen" flag.
nr = set()
with open('tmp/sarg2nr.fa') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        name = _protein_name(record.description)
        if name:
            nr.add(name)

# Unique (type, subtype, header, accession) combinations from the DIAMOND table.
# Column 0 is the query id ('|'-separated, fields 1 and 2 are type and subtype),
# column 1 the subject accession, column 2 the subject title.
hits = set()
headers = set()
with open('tmp/refseq_protein.txt') as f:
    for line in f:
        fields = line.rstrip().split('\t')
        query_parts = fields[0].split('|')
        # `arg_type` rather than `type`, so the builtin stays usable in later cells.
        arg_type, subtype = query_parts[1], query_parts[2]
        accession, header = fields[1], _protein_name(fields[2])
        if header and 'hypothetical' not in header and ', partial' not in header:
            hits.add((arg_type, subtype, header, accession))
            headers.add(header)

# Sorting makes the row order reproducible across runs (set order is arbitrary).
rows = pd.DataFrame(sorted(hits), columns=['type', 'subtype', 'header', 'accession'])

# Protein name -> comma-separated, sorted, de-duplicated subtypes that hit it.
header2subtype = (
    rows.groupby('header')['subtype']
    .apply(lambda subtypes: ','.join(sorted(set(subtypes))))
    .to_dict()
)

In [3]:
# Keep every non-partial sequence from tmp/refseq_protein.fa whose cleaned
# protein name is one of the DIAMOND hit names in `headers`, and count how many
# sequences share each name.
records = []
cnt = 0
counter = defaultdict(int)
with open('tmp/refseq_protein.fa') as handle:
    for record in SeqIO.parse(handle, 'fasta'):
        if ', partial' in record.description:
            continue

        # First entry of the title, minus the identifier, the ' [' suffix and
        # any 'MULTISPECIES: ' prefix. Named `name` rather than `id` so the
        # builtin is not shadowed for the rest of the session.
        name = record.description.split(' >')[0].split(' ',1)[1].rsplit(' [')[0].split('MULTISPECIES: ')[-1]
        if name not in headers:
            continue

        records.append(record)
        counter[name] += 1
        cnt += 1
        # Progress marker every 2.5 million retained records.
        if cnt % 2500000 == 0:
            print('done:', cnt)

done: 2500000
done: 5000000
done: 7500000
done: 10000000
done: 12500000
done: 15000000
done: 17500000
done: 20000000
done: 22500000
done: 25000000
done: 27500000
done: 30000000


In [4]:
%%bash
# Download the PGAP HMM table into tmp/.
# -q: quiet; -N: timestamping (re-download only if the remote file is newer);
# -P tmp: save under the tmp directory.
wget -qN https://ftp.ncbi.nlm.nih.gov/hmm/current/hmm_PGAP.tsv -P tmp

In [5]:
# Build a Series mapping each product name to a comma-separated, sorted list of
# 'source@accession@gene_symbol' evidence tags from the PGAP HMM table.
# Missing values become the literal 'NA', the long PRK source label is
# shortened to 'PRK', and the accession loses everything after its first '.'.
hmm = (
    pd.read_table('tmp/hmm_PGAP.tsv')
    .fillna('NA')
    .assign(
        source=lambda df: df['source'].mask(df['source'] == 'NCBI Protein Cluster (PRK)', 'PRK'),
        evidence=lambda df: (
            df['source']
            + '@'
            + df['#ncbi_accession'].str.split('.').str.get(0)
            + '@'
            + df['gene_symbol']
        ),
    )
    .groupby('product_name')
    .evidence
    .apply(lambda tags: ','.join(sorted(tags)))
)

In [6]:
# Protein names to exclude: the cells further down drop every name listed here
# (`~tmp.index.isin(nset)` in the review cell, `not in nset` when writing
# tmp/sarg_raw.fa). Presumably these names are too generic to pin down a
# specific gene -- confirm with the curator.
#
# Each entry's trailing comment is the line printed by the review cell:
#   <count> | seen: <bool> | <HMM evidence> | <subtypes>
#   count     - number of retained RefSeq sequences carrying this name (`counter`)
#   seen      - whether the name also occurs in tmp/sarg2nr.fa (`nr`)
#   evidence  - 'source@accession@gene_symbol' tags from hmm_PGAP.tsv, or NA
#   subtypes  - subtypes whose DIAMOND hits carried this name (`header2subtype`)
# The strings are matched verbatim against database titles, so do not "fix"
# their spelling.
nset = {

    ## unnamed
    "unnamed protein product", # 29836 | seen: True | NA | aceI,acrA,acrB,acrD,acrE,acrF,adeE,adeJ,aheA,aheB,ameB,amrB,amvA,arnA,arnB,arnC,arnD,arnE,arnF,axyA,axyB,axyY,bcr,bepC,bepD,bepE,blaOXA,bpeA,bpeB,cmeB,cprA,dltC,eefB,emhA,emhB,emrA,emrB,emrK,emrY,eptB,farB,fsr,kmrA,mdtA,mdtB,mdtC,mdtE,mdtF,mdtQ,mexA,mexB,mexY,mph(E),mph(G),msr(D),msr(E),muxB,muxC,rosA,sdeX,sdeY,smdA,smdB,smeB,smeD,smeE,smvA,srpA,srpB,sugE,tbtA,tbtB,tet(39),tet(41),tet(64),tet(C),tolC,ttgA,ttgB,ttgE,vmeB


    ## oxidoreductase
    "SDR family oxidoreductase", # 1128079 | seen: False | EBI-EMBL@NF019605@NA,EBI-EMBL@NF024950@NA,NCBIFAM@NF038213@NA,PRK@NF004110@NA,PRK@NF004196@NA,PRK@NF004284@NA,PRK@NF004324@NA,PRK@NF004513@NA,PRK@NF004514@NA,PRK@NF004521@NA,PRK@NF004522@NA,PRK@NF004528@NA,PRK@NF004534@NA,PRK@NF004649@NA,PRK@NF004765@NA,PRK@NF004774@NA,PRK@NF004777@NA,PRK@NF004778@NA,PRK@NF004782@NA,PRK@NF004791@NA,PRK@NF004792@NA,PRK@NF004818@NA,PRK@NF004825@NA,PRK@NF005065@NA,PRK@NF005075@NA,PRK@NF005214@NA,PRK@NF005372@NA,PRK@NF005395@NA,PRK@NF005400@NA,PRK@NF005402@NA,PRK@NF005403@NA,PRK@NF005436@NA,PRK@NF005437@NA,PRK@NF005446@NA,PRK@NF005449@NA,PRK@NF005467@NA,PRK@NF005468@NA,PRK@NF005469@NA,PRK@NF005473@NA,PRK@NF005474@NA,PRK@NF005489@NA,PRK@NF005495@NA,PRK@NF005539@NA,PRK@NF005594@NA,PRK@NF005672@NA,PRK@NF005681@NA,PRK@NF005711@NA,PRK@NF005752@NA,PRK@NF005753@NA,PRK@NF005853@NA,PRK@NF005854@NA,PRK@NF005861@NA,PRK@NF005868@NA,PRK@NF005873@NA,PRK@NF005880@NA,PRK@NF005881@NA,PRK@NF005893@NA,PRK@NF005939@NA,PRK@NF005950@NA,PRK@NF006035@NA,PRK@NF006073@NA,PRK@NF006099@NA,PRK@NF006115@NA,PRK@NF006116@NA,PRK@NF006117@NA,PRK@NF006118@NA,PRK@NF006119@NA,PRK@NF006120@NA,PRK@NF006123@NA,PRK@NF006133@NA,PRK@NF006159@NA,PRK@NF006213@NA,PRK@NF006366@NA,PRK@NF006384@NA,PRK@NF006431@NA,PRK@NF006464@NA,PRK@NF006532@NA,PRK@NF006565@NA,PRK@NF006597@NA,PRK@NF006693@NA,PRK@NF006776@NA,PRK@NF007273@NA,PRK@NF009383@bdcA,PRK@NF009384@NA,PRK@NF009385@NA,PRK@NF009387@NA,PRK@NF009389@NA,PRK@NF009465@NA,PRK@NF009468@NA,PRK@NF009469@NA,PRK@NF009499@NA | cprA
    "NAD(P)/FAD-dependent oxidoreductase", # 726241 | seen: False | EBI-EMBL@NF015451@NA | tet(X)
    "FAD-dependent oxidoreductase", # 521441 | seen: False | EBI-EMBL@NF013314@NA,EBI-EMBL@NF013434@NA,EBI-EMBL@NF013739@NA,EBI-EMBL@NF019604@NA,EBI-EMBL@NF024240@NA,JCVI@TIGR01677@NA,PRK@NF004789@NA,PRK@NF004833@NA,PRK@NF004834@NA,PRK@NF004835@NA,PRK@NF006002@NA,PRK@NF007450@NA,PRK@NF009476@NA,PRK@NF009477@NA | iri,rox,tet(X)
    "NAD(P)-dependent oxidoreductase", # 349802 | seen: False | EBI-EMBL@NF014841@NA,EBI-EMBL@NF024638@NA,PRK@NF004045@NA | vanH
    "GMC family oxidoreductase", # 99601 | seen: True | NA | capO
    "GMC family oxidoreductase N-terminal domain-containing protein", # 16083 | seen: False | EBI-EMBL@NF012935@NA | capO


    ## hydrolase
    "alpha/beta fold hydrolase", # 997499 | seen: False | EBI-EMBL@NF012770@NA,EBI-EMBL@NF024109@NA | albD,estT,nonR
    "alpha/beta hydrolase", # 846732 | seen: False | EBI-EMBL@NF017773@NA,EBI-EMBL@NF017808@NA,EBI-EMBL@NF018018@NA,EBI-EMBL@NF018091@NA,EBI-EMBL@NF018234@NA,EBI-EMBL@NF018525@NA,EBI-EMBL@NF018834@NA,EBI-EMBL@NF019982@NA,EBI-EMBL@NF024107@NA | albD,bahA,estDL136,estT,nonR
    "MBL fold metallo-hydrolase", # 586531 | seen: False | EBI-EMBL@NF012955@NA,EBI-EMBL@NF024118@NA,EBI-EMBL@NF024874@NA,PRK@NF002553@NA | blaAFM,blaB3SU1,blaCAR,blaCRD3,blaEAM,blaECM,blaEFM,blaELM,blaEVM,blaLMB,blaNWM,blaSIE,blaTHIN,cphA,varG
    "serine hydrolase", # 330236 | seen: False | EBI-EMBL@NF012371@NA,EBI-EMBL@NF024748@NA | bla*,bla1,bla3,blaAAK,blaACT,blaAQU,blaBBI,blaBCL,blaBUT,blaCBP,blaCDA,blaCDD,blaCMH,blaCMY,blaCTX-M,blaDHA,blaEC,blaEXO,blaFONA,blaGIL,blaIDC,blaKLUC,blaL2,blaLAQ,blaLEN,blaLRA,blaMIR,blaOHIO,blaOKP,blaOXA,blaOXY,blaP,blaPDC,blaPRC,blaPSZ,blaRAHN,blaRHO,blaRUB,blaSED,blaSFDC,blaSHV,blaTEM,blaTER,blaYOC,blaYRC,blaZ,cepH,cepS,cfxA,hugA
    "dienelactone hydrolase family protein", # 118684 | seen: False | EBI-EMBL@NF013866@NA | albD
    "cysteine hydrolase family protein", # 82604 | seen: False | NA | sttH
    "sulfatase-like hydrolase/transferase", # 69373 | seen: False | EBI-EMBL@NF013080@NA | eptA
    "alpha/beta hydrolase fold domain-containing protein", # 22427 | seen: False | EBI-EMBL@NF019474@NA | estDL136
    "guanitoxin biosynthesis MBL fold metallo-hydrolase GntH", # 607 | seen: False | NCBIFAM@NF041257@gntH | blaPNGM
    "esterase, hydrolase", # 1 | seen: False | NA | albD

    
    ## monooxygenase|dehydrogenase
    "acyl-CoA dehydrogenase family protein", # 601368 | seen: False | EBI-EMBL@NF012656@NA,EBI-EMBL@NF014789@NA,EBI-EMBL@NF014790@NA | sulX
    "FAD-dependent monooxygenase", # 161002 | seen: False | EBI-EMBL@NF013646@NA | iri,rox,tet(47),tet(48),tet(49),tet(50),tet(51),tet(54),tet(55),tet(56)
    "nucleotide sugar dehydrogenase", # 67230 | seen: False | JCVI@TIGR03026@NA | ugd
    "D-isomer specific 2-hydroxyacid dehydrogenase family protein", # 9182 | seen: False | NA | vanH
    "K5 polysaccharide biosynthesis UDP-glucose dehydrogenase KfiD", # 21 | seen: False | NA | ugd
        

    ## transporter|secretion|membrane
    "ABC transporter ATP-binding protein", # 2854546 | seen: True | PRK@NF007921@NA,PRK@NF008358@NA | abcA,anrA,aviABC-1,bceA,bcrA,bmrA,bmrC,bmrD,cprA,derA,efrA,efrB,efrC,efrD,expB,horA,lieA,lmrA,lmrC,lmrD,lnrL,lugG,narA,novA,nsrF,ole(C)-1,patA,patB,pmtC,psdA,ranA,rapA,satA,satB,smrA,tetA,tetA(46),tetA(58),tetA(60),tetB,tetB(46),tetB(60),tnrB-1,vcaM,vraD,ysaC,yxdL
    "MFS transporter", # 2516997 | seen: True | EBI-EMBL@NF012312@NA,EBI-EMBL@NF013473@NA,EBI-EMBL@NF017760@NA,EBI-EMBL@NF019310@NA,EBI-EMBL@NF023128@NA,EBI-EMBL@NF024241@NA,EBI-EMBL@NF024742@NA,JCVI@TIGR00903@NA,NCBIFAM@NF037955@NA,NCBIFAM@NF037960@NA,NCBIFAM@NF043065@NA,PRK@NF002962@NA,PRK@NF003477@NA,PRK@NF008963@NA | aadT,abaF,abaQ,albF,amvA,bahA,bcmT,bmr1,bmr2,cme,cmlA,cmlB,cmlR,cmr,cmrA,cmx,couR5,craA,efmA,efpA,emeA,emrD,facT,fexA,fexB,fsr,jefA,kdeA,kmrA,lde,lfrA,lmr(A),lmr(B),lmrP,lmrS,mdeA,mdfA,mdrL,mdrP,mdt(A),mdt(P),mdtG,mdtH,mdtL,mdtM,mef(A),mef(B),mef(C),mef(D),mef(E),mef(F),mef(H),mef(J),mmr,mrx(A),norB,norC,otr(B),pep,pmrA,ptr,pur8,qacA/B,qepA,rosA,sdrM,simX,slgT,smfY,smvA,stp,tap,tcmA,tcr3,tcrA,tet(30),tet(38),tet(39),tet(40),tet(41),tet(42),tet(43),tet(45),tet(62),tet(63),tet(64),tet(A),tet(B),tet(C),tet(D),tet(G),tet(K),tet(L),tet(V),tet(Y),tetA(P),varS
    "ABC transporter permease", # 2252892 | seen: True | EBI-EMBL@NF013245@NA,EBI-EMBL@NF014460@NA,EBI-EMBL@NF015605@NA,EBI-EMBL@NF017759@NA,EBI-EMBL@NF024110@NA,EBI-EMBL@NF024116@NA,EBI-EMBL@NF024141@NA,JCVI@TIGR01581@NA,NCBIFAM@NF038017@NA,PRK@NF011648@NA | anrB,aviABC-2,bceB,bcrB,derB,expA,lieB,lnrM,lnrN,macB,narB,nsrP,ole(C)-2,otr(C)-2,psdB,ranB,rapB,tetB(58),vraE,ysaB,yxdM
    "DMT family transporter", # 625911 | seen: False | EBI-EMBL@NF016535@NA,EBI-EMBL@NF017468@NA,NCBIFAM@NF038012@NA | abeS,bcrB,bcrC,ebrA,ebrB,emrC,emrE,hsmR,mmr,qac*,qacE,qacF/L,qacG,qacH/Z,qacJ,qacK,ssmE,sugE,ykkC,ykkD
    "efflux RND transporter periplasmic adaptor subunit", # 440548 | seen: True | EBI-EMBL@NF027890@NA,JCVI@TIGR01730@NA | acrA,acrE,adeA,adeD,adeF,adeI,aheA,ameA,amrA,axyA,axyX,bepD,bepF,bpeA,bpeE,cmeA,cmeE,eefA,emhA,emrA,emrK,farA,hefB,lpeA,macA,mdsA,mdtA,mdtE,mexA,mexC,mexE,mexH,mexJ,mexM,mexP,mexV,mexX,mtrC,muxA,oqxA,pseB,sdeA,sdeX,smeA,smeD,srpA,tbtA,tmexC,triA,triB,ttgA,ttgD,vceA,vexA,vexC,vexE,vexG,vexI,vexJ,vmeA,vmeC,vmeE,vmeG,vmeH,vmeJ,vmeP,vmeT,vmeU,vmeY
    "efflux RND transporter permease subunit", # 369713 | seen: True | EBI-EMBL@NF013069@NA,JCVI@TIGR00915@NA | acrB,acrD,acrF,adeB,adeE,adeG,adeJ,aheB,ameB,amrB,axyB,axyY,bepE,bepG,bpeB,bpeF,cmeB,cmeF,eefB,emhB,hefC,kexD,lpeB,mdsB,mdtB,mdtC,mdtF,mexB,mexD,mexF,mexI,mexK,mexN,mexQ,mexW,mexY,mtrD,muxB,muxC,oqxB,pseC,sdeB,sdeY,smeB,smeE,srpB,tbtB,tmexD,triC,ttgB,ttgE,vexB,vexD,vexF,vexH,vexK,vmeB,vmeD,vmeF,vmeI,vmeK,vmeQ,vmeV,vmeZ
    "MATE family efflux transporter", # 262674 | seen: True | EBI-EMBL@NF013703@NA,JCVI@TIGR00797@NA,PRK@NF007130@NA | abeM,bexA,cdeA,dinF,emmdR,fepA,hmrM,mdtK,mepA,mmp,norM,pdrM,pmpM,vcmA,vcmB,vcmD,vcmH,vcmN,vcrM,vmrA
    "AI-2E family transporter", # 178500 | seen: False | EBI-EMBL@NF013740@NA,PRK@NF008930@NA | cxpE
    "efflux transporter outer membrane subunit", # 150242 | seen: True | JCVI@TIGR01845@NA | adeC,adeH,adeK,aheC,ameC,axyM,cmeC,eefC,emhC,emrC,mdsC,mdtP,mdtQ,mtrE,opmB,opmD,opmE,oprA,oprB,oprC,oprJ,oprM,oprN,oprZ,pseA,smeC,smeF,srpC,tbtM,toprJ,ttgC,ttgF,vceC
    "HlyD family secretion protein", # 141632 | seen: True | EBI-EMBL@NF012739@NA | emrA,farA,vceA
    "MMPL family transporter", # 124718 | seen: False | EBI-EMBL@NF015155@NA | farE,mmpL5,mmpL7
    "EamA family transporter", # 113222 | seen: False | EBI-EMBL@NF013088@NA | arnE
    "DHA2 family efflux MFS transporter permease subunit", # 98970 | seen: False | JCVI@TIGR00711@NA | albF,bmr3,efpA,emrB,emrY,facT,farB,lmr(A),lmr(B),lmrS,mdeA,mdrM,mdrT,mdt(P),mmr,otr(B),pep,ptr,pur8,simX,slgT,smfY,stp,tcmA,varS,vceB
    "MDR family MFS transporter", # 85151 | seen: True | NA | bmr3,lmr(B),lmrP,lmrS,mdeA,mdrM,mdrP,mdrT,mdt(P),otr(B),simX,tcr3,vceB
    "multidrug effflux MFS transporter", # 79895 | seen: False | NA | emrD
    "multidrug efflux SMR transporter", # 59965 | seen: True | NA | abeS,bcrB,bcrC,ebrA,ebrB,emrC,emrE,hsmR,kpnE,mdtJ,mmr,qac*,qacC,qacE,qacF/L,qacG,qacH/Z,qacJ,qacK,smr,ssmE,sugE,ykkC,ykkD
    "ABC transporter ATP-binding protein/permease", # 56577 | seen: False | NA | abcA,bmrD,efrB,efrC,efrD,horA,lmrA,lmrD,patB,satB,smrA,vcaM
    "tripartite tricarboxylate transporter TctB family protein", # 49120 | seen: False | EBI-EMBL@NF018979@NA | bahA
    "HlyD family efflux transporter periplasmic adaptor subunit", # 45616 | seen: False | EBI-EMBL@NF024112@NA,EBI-EMBL@NF024829@NA | emrA,emrK,farA,hefB,mdtN,vceA
    "RND family transporter", # 39405 | seen: False | NA | mmpL5
    "ABC transporter transmembrane domain-containing protein", # 35429 | seen: False | EBI-EMBL@NF012869@NA | abcA,bmrA,bmrC,bmrD,efrA,efrB,efrC,efrD,lmrC,lmrD,novA,patA,patB,satA,satB,smdA,smdB,tetA,tetA(46),tetB(46)
    "multidrug efflux RND transporter permease subunit", # 24780 | seen: True | NCBIFAM@NF000282@NA,NCBIFAM@NF033617@NA,PRK@NF007131@NA,PRK@NF007133@NA | acrB,acrD,acrF,adeB,adeE,adeG,adeJ,aheB,ameB,amrB,axyB,axyY,bepE,bepG,bpeB,bpeF,cmeB,eefB,emhB,kexD,mdsB,mdtB,mdtC,mdtF,mexB,mexD,mexF,mexI,mexN,mexQ,mexW,mexY,mtrD,muxB,muxC,oqxB,sdeB,sdeY,smeB,smeE,srpB,tbtB,tmexD,ttgB,ttgE,vexB,vexH,vmeB,vmeD,vmeF,vmeQ,vmeZ
    "AbgT family transporter", # 18848 | seen: True | EBI-EMBL@NF015746@NA | mtrF,ydaH
    "PACE efflux transporter", # 9717 | seen: False | NCBIFAM@NF033664@NA | aceI
    "putative ABC transporter permease", # 9233 | seen: False | NA | cmpB
    "SMR family transporter", # 8730 | seen: False | EBI-EMBL@NF013089@NA | abeS,arnE,bcrB,bcrC,ebrA,ebrB,emrC,emrE,hsmR,kpnE,kpnF,mdtI,mdtJ,mmr,qac*,qacC,qacE,qacF/L,qacG,qacH/Z,qacJ,qacK,smr,ssmE,sugE,ykkC,ykkD
    "multidrug efflux MFS transporter", # 7524 | seen: False | NA | lde,mdtG,pmrA

    
    ## transferase
    "GNAT family N-acetyltransferase", # 1929223 | seen: True | EBI-EMBL@NF012792@NA,EBI-EMBL@NF017144@NA,EBI-EMBL@NF020034@NA,EBI-EMBL@NF024157@NA,EBI-EMBL@NF024698@NA,EBI-EMBL@NF024812@NA,EBI-EMBL@NF024871@NA,EBI-EMBL@NF024898@NA,EBI-EMBL@NF024913@NA,EBI-EMBL@NF024917@NA,EBI-EMBL@NF025054@NA,EBI-EMBL@NF025095@NA,PRK@NF007338@NA,PRK@NF007644@NA | aac(2')*,aac(2')-I,aac(2')-II,aac(3)-I,aac(3)-XI,aac(6')*,aac(6')-I,aac(6')-II,aac(6')-III,blmB,cpaA,edeQ,eis,nat1,pac,sat2,sat3,sat4,satA,sta,tlmB
    "class I SAM-dependent methyltransferase", # 855978 | seen: True | EBI-EMBL@NF013382@NA,EBI-EMBL@NF014414@NA,EBI-EMBL@NF015998@NA,EBI-EMBL@NF016340@NA,EBI-EMBL@NF018645@NA,EBI-EMBL@NF022137@NA,EBI-EMBL@NF024104@NA,EBI-EMBL@NF024967@NA,PRK@NF005379@NA | cmnU,emtA,kamB,kmr,llmA
    "glycosyltransferase", # 839719 | seen: False | EBI-EMBL@NF012744@NA,EBI-EMBL@NF012745@NA,EBI-EMBL@NF013652@NA,EBI-EMBL@NF015022@NA,EBI-EMBL@NF016027@NA,EBI-EMBL@NF016377@NA,EBI-EMBL@NF022760@NA,EBI-EMBL@NF024831@NA,EBI-EMBL@NF024868@NA,EBI-EMBL@NF024896@NA,EBI-EMBL@NF024914@NA,EBI-EMBL@NF024968@NA,EBI-EMBL@NF025025@NA,EBI-EMBL@NF025072@NA,EBI-EMBL@NF025089@NA,PRK@NF007482@NA,PRK@NF010008@NA | arnC,mgt,ole(D),rgt
    "methyltransferase domain-containing protein", # 193694 | seen: False | EBI-EMBL@NF019846@NA,EBI-EMBL@NF024880@NA,EBI-EMBL@NF025033@NA,EBI-EMBL@NF025217@NA | chrB,cmnU,emtA,erm(32),kamB,myrA
    "aminotransferase class I/II-fold pyridoxal phosphate-dependent enzyme", # 157230 | seen: False | EBI-EMBL@NF012382@NA | arnB
    "aminotransferase class V-fold PLP-dependent enzyme", # 143697 | seen: False | EBI-EMBL@NF012488@NA | cac
    "SAM-dependent methyltransferase", # 130830 | seen: False | EBI-EMBL@NF012799@NA,EBI-EMBL@NF013856@NA,EBI-EMBL@NF014673@NA,EBI-EMBL@NF016550@NA,EBI-EMBL@NF017238@NA,EBI-EMBL@NF024255@NA,JCVI@TIGR00027@NA | llmA
    "N-acetyltransferase", # 121631 | seen: False | EBI-EMBL@NF025895@NA,PRK@NF005840@NA,PRK@NF007527@yhhY,PRK@NF007853@NA,PRK@NF010241@NA | aac(2')-I,aac(3)-I,aac(3)-XI,aac(6')-I,cpaA,edeQ,nat1,pac,sat3,sat4,satA,sta
    "RNA methyltransferase", # 114889 | seen: True | EBI-EMBL@NF021455@NA,PRK@NF011034@NA | aviRb,tsnR
    "phosphotransferase", # 113695 | seen: False | EBI-EMBL@NF013775@NA | aph(2'')*,aph(2'')-III,aph(9)-I,ard2,cap21,capP,cph,cpr17,mph(I),mph(J),mph(K),mph(L),mph(N),vph
    "methyltransferase", # 110210 | seen: False | EBI-EMBL@NF013087@NA,EBI-EMBL@NF016107@NA,EBI-EMBL@NF017026@NA,EBI-EMBL@NF019847@NA,EBI-EMBL@NF025059@NA,PRK@NF010654@NA | emtA
    "lysophospholipid acyltransferase family protein", # 105525 | seen: False | PRK@NF006487@NA | almG
    "glycosyltransferase family 1 protein", # 97838 | seen: False | NA | rgt
    "nucleotidyltransferase family protein", # 96420 | seen: False | EBI-EMBL@NF017452@NA,EBI-EMBL@NF017820@NA,EBI-EMBL@NF026257@NA | lnu(C),lnu(D),lnu(P)
    "aminoglycoside phosphotransferase family protein", # 92559 | seen: True | EBI-EMBL@NF016533@NA | aph(2'')*,aph(2'')-I,aph(2'')-II,aph(4)-I,aph(6)-I,aph(7'')-I,aph(9)-I,cap21,capP,cph,cpr17,vph
    "site-specific DNA-methyltransferase", # 92537 | seen: False | EBI-EMBL@NF023978@NA | spcM
    "DegT/DnrJ/EryC1/StrS aminotransferase family protein", # 83451 | seen: False | NA | arnB
    "phosphotransferase family protein", # 76074 | seen: False | NA | aph(2'')-II,aph(2'')-III,aph(4)-I,aph(7'')-I,aph(9)-I,cph,vph
    "nucleotidyltransferase domain-containing protein", # 74424 | seen: False | EBI-EMBL@NF014021@NA,EBI-EMBL@NF022167@NA | ant(2'')-I,ant(4')-II,lnu(A),lnu(B),lnu(C),lnu(D),lnu(E),lnu(F),lnu(G),lnu(P)
    "50S ribosomal protein L11 methyltransferase", # 56424 | seen: False | EBI-EMBL@NF018076@NA,JCVI@TIGR00406@prmA,PRK@NF001784@NA,PRK@NF001785@prmA,PRK@NF001786@NA,PRK@NF001790@NA | aviRa
    "DNA methyltransferase", # 44188 | seen: False | EBI-EMBL@NF013704@NA,EBI-EMBL@NF042814@NA,PRK@NF010253@NA | spcM
    "acetyltransferase", # 38206 | seen: False | PRK@NF007807@NA,PRK@NF010621@NA | aac(6')*,vat(E),vat(F)
    "phosphoethanolamine transferase", # 32606 | seen: True | NA | eptC,icr,mcr-1,mcr-10,mcr-11,mcr-2,mcr-3,mcr-4,mcr-5,mcr-6,mcr-7,mcr-8,mcr-9
    "DegT/DnrJ/EryC1/StrS family aminotransferase", # 31449 | seen: False | EBI-EMBL@NF013227@NA | arnB
    "phosphotransferase enzyme family protein", # 30807 | seen: False | NA | aph(9)-I
    "nucleotidyltransferase", # 29868 | seen: False | EBI-EMBL@NF021487@NA,PRK@NF010191@NA,PRK@NF010192@NA | lnu(B),lnu(G)
    "xanthine phosphoribosyltransferase", # 19354 | seen: False | JCVI@TIGR01744@xpt,PRK@NF006613@gpt,PRK@NF006671@NA | tet(34)
    "TrmH family RNA methyltransferase", # 17392 | seen: False | EBI-EMBL@NF012797@NA | aviRb,nshR,tsnR,tsr
    "putative RNA methyltransferase", # 16368 | seen: False | EBI-EMBL@NF045339@NA | chrB,erm(32),myrA
    "N-acetyltransferase family protein", # 14067 | seen: False | NA | aac(3)-I,aac(3)-XI,nat1,sta
    "peptidoglycan bridge formation glycyltransferase FemA/FemB family protein", # 13745 | seen: False | EBI-EMBL@NF014445@NA | vanK
    "phosphatidylglycerol lysyltransferase domain-containing protein", # 12515 | seen: False | EBI-EMBL@NF021445@NA | mprF
    "aminoacyltransferase", # 6754 | seen: False | NA | vanK
    "glutathione transferase", # 5724 | seen: False | PRK@NF011693@yfcF | fos*,fosA,fosC,fosL
    "lipid A biosynthesis acyltransferase", # 4435 | seen: False | NA | almG
    "aminoglycoside adenylyltransferase domain-containing protein", # 4291 | seen: False | EBI-EMBL@NF024819@NA | ant(3'')-I,ant(3'')-II,ant(9)*,ant(9)-I
    "aminoglycoside adenylyltransferase family protein", # 1896 | seen: False | PRK@NF010309@NA | ant(3'')-I,ant(9)*,ant(9)-I
    "lipid II:glycine glycyltransferase FemX", # 1730 | seen: False | NA | vanK
    "peptidoglycan bridge formation glycyltransferase FemX", # 594 | seen: False | NA | vanK
    "peptidoglycan bridge formation glycyltransferase FemY", # 13 | seen: False | NA | vanK

    
    ## family
    "VOC family protein", # 821665 | seen: False | EBI-EMBL@NF013098@NA,EBI-EMBL@NF017949@NA,EBI-EMBL@NF018663@NA,EBI-EMBL@NF024859@NA,EBI-EMBL@NF025050@NA,EBI-EMBL@NF037170@NA,PRK@NF008551@NA,PRK@NF008678@NA,PRK@NF008679@NA,PRK@NF008680@NA,PRK@NF008681@NA,PRK@NF008682@NA,PRK@NF008683@NA | ble*,bleO,blmA,fos*,fosA,fosB,fosC,fosD,fosE,fosF,fosG,fosH,fosI,fosK,fosL,fosM,fosU,fosX,fosY,tlmA,zbmA
    "AAA family ATPase", # 740653 | seen: True | EBI-EMBL@NF012234@NA,EBI-EMBL@NF019344@NA,EBI-EMBL@NF019346@NA,EBI-EMBL@NF019348@NA,EBI-EMBL@NF021349@NA,EBI-EMBL@NF024565@NA,EBI-EMBL@NF024571@NA,EBI-EMBL@NF024573@NA,EBI-EMBL@NF024589@NA,EBI-EMBL@NF024605@NA,EBI-EMBL@NF024635@NA,EBI-EMBL@NF024642@NA,EBI-EMBL@NF024700@NA,EBI-EMBL@NF024793@NA,EBI-EMBL@NF024867@NA,EBI-EMBL@NF024870@NA,EBI-EMBL@NF024872@NA,EBI-EMBL@NF024891@NA,EBI-EMBL@NF024904@NA,EBI-EMBL@NF024911@NA,EBI-EMBL@NF024991@NA,EBI-EMBL@NF025001@NA,EBI-EMBL@NF025038@NA,EBI-EMBL@NF025052@NA,PRK@NF009880@NA,PRK@NF009881@NA,PRK@NF009882@NA,PRK@NF009883@NA | aph(2'')*,ard2,cpt,helR,tmrB,tmrD
    "MerR family transcriptional regulator", # 324261 | seen: True | EBI-EMBL@NF024803@NA | tipA
    "phosphatase PAP2 family protein", # 213571 | seen: False | EBI-EMBL@NF013718@NA,EBI-EMBL@NF025736@NA | bcrC,lpxE,lpxF
    "polysaccharide deacetylase family protein", # 212191 | seen: False | EBI-EMBL@NF013672@NA | arnD
    "endonuclease/exonuclease/phosphatase family protein", # 191136 | seen: False | EBI-EMBL@NF015338@NA,PRK@NF003839@NA,PRK@NF003840@NA,PRK@NF003841@NA,PRK@NF003842@NA | vanJ
    "TolC family protein", # 186658 | seen: False | EBI-EMBL@NF014385@NA | adeH,axyM,bepC,cmeC,cmeD,eefC,emhC,ftlC,hefA,mdsC,mdtP,mdtQ,mtrE,opmD,opmE,oprC,oprN,smeF,tolC
    "pyridoxamine 5'-phosphate oxidase family protein", # 176604 | seen: False | EBI-EMBL@NF013412@NA,EBI-EMBL@NF024304@NA,EBI-EMBL@NF027567@NA | nimA,nimB,nimC,nimD,nimE,nimF,nimG,nimH,nimI,nimJ,nimK,nimL
    "ABC-F family ATP-binding cassette domain-containing protein", # 172245 | seen: False | NA | ard1,car(A),cmpA,lmr(C),ole(B),optrA,poxtA,srm(B),taeA,tlr(C),vmlR
    "FtsX-like permease family protein", # 149771 | seen: True | EBI-EMBL@NF014717@NA | anrB,bceB,derB,nsrP,psdB,rapB,vraE,ysaB,yxdM
    "flavin reductase family protein", # 133318 | seen: False | NA | sulR
    "S41 family peptidase", # 130753 | seen: True | EBI-EMBL@NF015531@NA | nsr
    "D-alanyl-D-alanine carboxypeptidase family protein", # 95875 | seen: False | EBI-EMBL@NF014600@NA | vanXY
    "isochorismatase family protein", # 70335 | seen: False | EBI-EMBL@NF013053@NA | sttH
    "M15 family metallopeptidase", # 67688 | seen: False | EBI-EMBL@NF013586@NA,EBI-EMBL@NF024929@NA | vanX,vanXY
    "Na+/H+ antiporter NhaC family protein", # 45156 | seen: False | EBI-EMBL@NF015515@NA | tet(35)
    "GNAT family protein", # 32099 | seen: False | NA | aac(6')*,aac(6')-I,cpaA
    "D-alanine--D-alanine ligase family protein", # 20093 | seen: False | NA | vanA,vanB,vanC,vanD,vanE,vanF,vanG,vanI,vanL,vanM,vanN,vanO,vanP
    "zeta toxin family protein", # 15933 | seen: False | EBI-EMBL@NF018154@NA | aph(2'')*,ard2
    "translation factor GTPase family protein", # 15750 | seen: False | NA | otr(A),tet(32),tet(44),tet(M),tet(O),tet(S),tet(W),tetB(P)
    "Yip1 family protein", # 15184 | seen: False | NA | bahA

    
    ## binding
    "ATP-binding protein", # 1274011 | seen: True | EBI-EMBL@NF013260@NA,EBI-EMBL@NF013348@NA,EBI-EMBL@NF013776@NA,EBI-EMBL@NF013828@NA,EBI-EMBL@NF014567@NA,EBI-EMBL@NF023550@NA,EBI-EMBL@NF024254@NA,EBI-EMBL@NF024944@NA,EBI-EMBL@NF024970@NA,EBI-EMBL@NF024977@NA,EBI-EMBL@NF025124@NA,PRK@NF005304@NA,PRK@NF009043@NA | aph(2'')*,ard2,tmrB
    "ATP-binding cassette domain-containing protein", # 537972 | seen: False | EBI-EMBL@NF012235@NA | anrA,ard1,aviABC-1,bceA,bcrA,car(A),cmpA,cprA,derA,expB,lieA,lmr(C),lnrL,lsa(A),lsa(B),lsa(C),lsa(E),macB,msr(A),narA,nsrF,ole(B),ole(C)-1,optrA,otr(C)-1,psdA,ranA,rapA,smrA,srm(B),taeA,tetA(58),tetB,tlr(C),tnrB-1,tva(A),tva(B),tva(C),tva(D),vcaM,vga(F),vraD,ysaC,yxdL
    "AMP-binding protein", # 302180 | seen: False | EBI-EMBL@NF012711@NA | almE
    "GTP-binding protein", # 104608 | seen: False | EBI-EMBL@NF012239@NA,EBI-EMBL@NF014542@NA,EBI-EMBL@NF019303@NA,JCVI@TIGR00231@NA,JCVI@TIGR00991@NA | otr(A),tet(36),tet(Q)
    "penicillin-binding transpeptidase domain-containing protein", # 60389 | seen: False | EBI-EMBL@NF013100@NA | blaBAT,blaBPU,blaBSU,blaCDD,blaLCR/NPS,blaOXA,blaRSD1,blaRSD2
    "PEP/pyruvate-binding domain-containing protein", # 22564 | seen: False | EBI-EMBL@NF013491@NA | rphB,rphC,rphD
    "nucleotide disphospho-sugar-binding domain-containing protein", # 11445 | seen: False | EBI-EMBL@NF018434@NA | mgt,ole(D),rgt
    "biotin/lipoyl-binding protein", # 7613 | seen: False | EBI-EMBL@NF024923@NA | emrA,mdtN,vceA,vmeU
    "putative solute-binding protein", # 4467 | seen: False | EBI-EMBL@NF039651@NA | adeT
    "UDP binding domain-containing protein", # 1617 | seen: False | EBI-EMBL@NF015665@NA | ugd

    
    ## containing
    "pentapeptide repeat-containing protein", # 123512 | seen: True | EBI-EMBL@NF013003@NA,EBI-EMBL@NF013623@NA,EBI-EMBL@NF024965@NA,EBI-EMBL@NF024986@NA | albG,mfpA,qnr*,qnrB,qnrE
    "UvrD-helicase domain-containing protein", # 103449 | seen: False | EBI-EMBL@NF012789@NA | helR
    "FG-GAP-like repeat-containing protein", # 29373 | seen: False | EBI-EMBL@NF024907@NA | fusH
    "JAB domain-containing protein", # 9622 | seen: False | EBI-EMBL@NF015935@NA | lnu(B),lnu(G)
    "FBP domain-containing protein", # 7926 | seen: False | EBI-EMBL@NF027885@NA | fusC
    "DUF2304 domain-containing protein", # 7759 | seen: False | NA | bahA
    "flippase-like domain-containing protein", # 3097 | seen: False | JCVI@TIGR00374@NA | mprF
    "THUMP domain-containing protein", # 2758 | seen: False | EBI-EMBL@NF014923@NA | emtA
    "cysteine-rich KTR domain-containing protein", # 2087 | seen: False | EBI-EMBL@NF025568@NA | cfr(C),cfr*
    "phosphoethanolamine transferase domain-containing protein", # 282 | seen: False | EBI-EMBL@NF019631@NA | icr,mcr-10,mcr-11,mcr-3,mcr-7,mcr-9

    
    ## other
    "radical SAM protein", # 117186 | seen: False | EBI-EMBL@NF015983@NA,NCBIFAM@NF045502@NA | cfr(A),clbC
    "alanine racemase", # 116238 | seen: False | EBI-EMBL@NF013345@NA,JCVI@TIGR00492@alr,PRK@NF000791@NA,PRK@NF000792@NA,PRK@NF009879@NA | vanT
    "isoleucine--tRNA ligase", # 89985 | seen: False | JCVI@TIGR00392@ileS | mupA,mupB
    "dihydropteroate synthase", # 80800 | seen: False | EBI-EMBL@NF013007@NA,JCVI@TIGR01496@folP,PRK@NF008625@folP | sul1,sul2,sul3,sul4
    "nucleotide exchange factor GrpE", # 62403 | seen: False | EBI-EMBL@NF013212@grpE,PRK@NF007655@grpE,PRK@NF010737@grpE,PRK@NF010738@grpE,PRK@NF010739@grpE,PRK@NF010740@grpE,PRK@NF010741@grpE,PRK@NF010742@grpE,PRK@NF010743@grpE,PRK@NF010744@grpE,PRK@NF010745@grpE,PRK@NF010746@grpE,PRK@NF010747@grpE,PRK@NF010748@grpE,PRK@NF010749@grpE,PRK@NF010750@grpE,PRK@NF010751@grpE,PRK@NF010752@grpE,PRK@NF010753@grpE,PRK@NF010754@grpE,PRK@NF010755@grpE,PRK@NF010756@grpE,PRK@NF010757@grpE,PRK@NF010758@grpE,PRK@NF010759@grpE,PRK@NF010760@grpE,PRK@NF010761@grpE | muxA
    "N-6 DNA methylase", # 54433 | seen: False | EBI-EMBL@NF014442@NA | aviRa
    "dihydrofolate reductase", # 51400 | seen: False | EBI-EMBL@NF012413@NA | dfrA,dfrC,dfrD,dfrE,dfrF,dfrG,dfrI,dfrK,dfrL
    "DapH/DapD/GlmU-related protein", # 45770 | seen: False | EBI-EMBL@NF025953@NA | catB,catC
    "trypsin-like serine protease", # 36639 | seen: False | EBI-EMBL@NF012318@NA | fusH
    "phosphoenolpyruvate synthase", # 33792 | seen: True | PRK@NF004877@NA,PRK@NF004878@NA,PRK@NF004879@ppsA,PRK@NF004880@NA,PRK@NF004881@NA,PRK@NF004882@NA,PRK@NF004883@NA,PRK@NF005057@ppsA | rphA,rphB,rphC,rphD
    "lipoprotein", # 33729 | seen: False | EBI-EMBL@NF013131@NA,EBI-EMBL@NF019749@NA,EBI-EMBL@NF025012@NA,EBI-EMBL@NF039509@NA,NCBIFAM@NF033168@NA,NCBIFAM@NF033169@NA,NCBIFAM@NF038029@NA,PRK@NF007824@NA,PRK@NF007894@NA,PRK@NF008518@NA | acrA,acrE,adeI,aheA,amrA,axyA,bpeA,eefA,emhA,mexA,mtrC,sdeX,smeD,ttgA
    "adenylyl-sulfate kinase", # 32474 | seen: False | EBI-EMBL@NF013730@NA,JCVI@TIGR00455@cysC,PRK@NF002059@NA,PRK@NF003013@NA,PRK@NF004041@NA | tmrB
    "chemotaxis protein", # 25131 | seen: False | PRK@NF009439@NA,PRK@NF009440@NA,PRK@NF009441@NA | cpt
    "flavin reductase", # 16739 | seen: False | EBI-EMBL@NF013756@NA | sulR
    "type 3 dihydrofolate reductase", # 5830 | seen: False | PRK@NF008037@folA | dfrA,dfrC,dfrD,dfrE,dfrF,dfrG,dfrI,dfrK
    "ATPase AAA", # 1075 | seen: False | NA | helR
}

In [7]:
# Per-name sequence counts, most frequent first, without the names already
# listed in nset.
tmp = (
    pd.DataFrame.from_dict(counter, orient='index')
    .sort_values(0, ascending=False)
    .loc[lambda df: ~df.index.isin(nset)]
)
print('subtype:', len(tmp), 'n:', tmp[0].sum())

# '.*' matches every name; narrow the regex to review one family at a time.
name_filter = tmp.index.str.contains('.*')
frequent = tmp[0] > 10000
# Printed in the same layout as the nset entries so lines can be pasted there.
for i in tmp[name_filter & frequent].index:
    print(f'"{i}", # {counter.get(i)} | seen: {i in nr} | {hmm.get(i, "NA")} | {header2subtype.get(i, "NA")}')

subtype: 6593 n: 824181
"undecaprenyl-diphosphate phosphatase", # 65426 | seen: True | EBI-EMBL@NF014703@NA,PRK@NF001388@bacA,PRK@NF001389@NA,PRK@NF001390@NA,PRK@NF001391@NA,PRK@NF001392@NA,PRK@NF001393@NA,PRK@NF001394@NA,PRK@NF001395@NA,PRK@NF001396@NA,PRK@NF001397@NA,PRK@NF001398@NA,PRK@NF008813@ybjG | bacA,bcrD
"TolC family outer membrane protein", # 40115 | seen: False | JCVI@TIGR01844@NA | bepC,hdrC,opmH,tolC,vpoC
"ribosomal protection-like ABC-F family protein", # 30409 | seen: True | NCBIFAM@NF000355@abc-f | ard1,car(A),cmpA,cplR,lmr(C),lsa(A),lsa(B),lsa(C),lsa(D),lsa(E),msr(A),msr(C),msr(D),msr(E),msr(F),msr(G),msr(H),msr(I),ole(B),optrA,poxtA,sal(D),sal(E),srm(B),taeA,tlr(C),tva(A),tva(B),tva(C),tva(D),vga(A),vga(B),vga(C),vga(D),vga(E),vga(F),vga(G),vmlR
"CatB-related O-acetyltransferase", # 23079 | seen: False | NA | apmA,catB,catC,vat(A),vat(B),vat(C),vat(D),vat(E),vat(F),vat(H),vat(I)
"bifunctional lysylphosphatidylglycerol flippase/synthetase MprF", # 22882 | seen: True |

In [8]:
# Filter once: keep the records whose cleaned protein name is not excluded by
# nset. The same list feeds both the id set and the FASTA output, so the title
# parsing runs a single time per record instead of twice.
records_kept = [
    x for x in records
    if x.description.split(' >')[0].split(' ',1)[1].rsplit(' [')[0].split('MULTISPECIES: ')[-1] not in nset
]

# Accessions to annotate via Entrez in the fetch cell.
ids = {x.id for x in records_kept}

with open('tmp/sarg_raw.fa', 'w') as output_handle:
    SeqIO.write(records_kept, output_handle, 'fasta')

In [9]:
%%bash
# Start from an empty remark/ directory so per-batch files left over from an
# earlier run are not picked up by the final concatenation.
rm -rf remark
mkdir -p remark

In [10]:
# Fetch the GenBank record of every retained accession from NCBI, in batches,
# and write one TSV per batch to remark/ with the columns:
#   accession, protein name, taxonomy lineage, organism,
#   evidence accession, gene symbol, evidence source identifier.

# Materialise the set once; striding over the same list keeps batches disjoint.
id_list = list(ids)
# Number of batches, chosen so that no batch exceeds 10,000 accessions.
ff = len(id_list) // 10000 + 1
folds = [id_list[i::ff] for i in range(ff)]

for sp, xx in enumerate(folds):
    zz = []
    handle = Entrez.efetch(db="protein", id=','.join(xx), rettype='gbwithparts', retmode="text")
    try:
        for record in SeqIO.parse(handle, "genbank"):
            # Reset both fields for every record: previously `source` was left
            # unset (or stale from the previous record) when a record had no
            # structured comment.
            evidence = 'NA'
            source = 'NA'
            name_evidence = record.annotations.get('structured_comment', {}).get('Evidence-For-Name-Assignment')
            if name_evidence:
                source = name_evidence.get('Source Identifier', 'NA')
                evidence = name_evidence.get('Evidence Accession', 'NA')

            # Drop everything from the first ' [' and any 'MULTISPECIES: ' prefix.
            header = record.description.rsplit(' [')[0].split('MULTISPECIES: ')[-1]

            # Gene symbol of the first 'gene' feature, if there is one and it
            # actually carries a 'gene' qualifier.
            gene = [x for x in record.features if x.type == 'gene']
            if gene:
                g = gene[0].qualifiers.get('gene', ['NA'])[0]
            else:
                g = 'NA'

            taxonomy = ';'.join(record.annotations['taxonomy'])
            zz.append([record.id, header, taxonomy, record.annotations['organism'], evidence, g, source])
    finally:
        # Release the connection even if parsing fails part-way through.
        handle.close()

    t = time.strftime("%m-%d-%H-%M-%S", time.localtime())
    pd.DataFrame(zz).to_csv(f'remark/remark{sp}_{t}.tsv', sep='\t', header=None, index=False)

# Merge the per-batch files; sorted() makes the row order reproducible.
# NOTE(review): read_table's default NA parsing turns the literal 'NA'
# placeholders into NaN, so they end up as empty fields in tmp/remark.tsv --
# confirm this is what downstream readers expect.
pd.concat(pd.read_table(x, header=None) for x in sorted(glob.glob('remark/*.tsv'))).to_csv('tmp/remark.tsv', sep='\t', index=False, header=None)