In [None]:
# %%bash
# diamond blastp -q init.fasta \
#     -d refseq_protein.dmnd --out refseq_protein50.txt \
#     --outfmt 6 qseqid sseqid stitle nident qlen slen pident qcovhsp scovhsp bitscore evalue \
#     --id 50 --subject-cover 75 --query-cover 75 \
#     -k 0 --threads 48 --quiet

In [None]:
from Bio import SeqIO
import pandas as pd

# Normalised protein names found in sarg2nr.fa: the text after the sequence id,
# cut at the first ' >' and the first ' [', with any 'MULTISPECIES: ' prefix removed.
with open('sarg2nr.fa') as handle:
    nr = {
        entry.description.split(' ',1)[1].split(' >')[0].rsplit(' [')[0].split('MULTISPECIES: ')[-1]
        for entry in SeqIO.parse(handle, 'fasta')
    }

In [None]:
# Parse the tab-separated hits in refseq_protein50.txt into unique
# (family, gene, header, subject id) tuples.
# Column use below: 0 = query id, 1 = subject id, 2 = subject title
# (presumably the qseqid/sseqid/stitle columns of the commented-out diamond call -- confirm).
records = set()
with open('refseq_protein50.txt') as f:
    for line in f:
        ls = line.rstrip().split('\t')
        if len(ls) < 3:
            # blank or truncated line: nothing to parse
            continue
        # the query id is '|'-delimited; fields 1 and 2 are taken as family and gene
        qseqid = ls[0].split('|')
        family, gene = qseqid[1], qseqid[2]
        # named `sseqid` rather than `id` so the builtin id() is not shadowed in the kernel
        sseqid = ls[1]
        # Drop the leading accession, anything after ' >' and the ' [' suffix, and the
        # 'MULTISPECIES: ' prefix.
        # NOTE(review): rsplit(' [') has no maxsplit, so it cuts at the FIRST ' ['; the same
        # expression is used in the other cells and must be changed everywhere or nowhere.
        header = ls[2].split(' >')[0].split(' ',1)[1].rsplit(' [')[0].split('MULTISPECIES: ')[-1]
        if 'hypothetical' not in header and ', partial' not in header:
            records.add((family, gene, header, sseqid))

In [None]:
# One row per unique BLAST hit tuple. `records` is a set, so sort it to get the same
# row order on every run.
dxx = pd.DataFrame(sorted(records), columns = ['family', 'gene', 'header', 'id'])
# header -> comma-joined query genes that hit it; sorted so the joined string does not
# depend on set iteration order (which changes between kernel restarts).
header2gene = dxx.groupby('header').gene.apply(lambda x: ','.join(sorted(set(x)))).to_dict()

In [None]:
# Protein names (normalised RefSeq headers) to exclude: any header listed here is dropped
# when `aset` is built right after this literal.
# The trailing comments appear to be pasted from the review cell further down, which prints
#   <number of retained records with this header> | seen: <header is in sarg2nr.fa names> | <query genes hitting it>
# Strings must match the database headers exactly, so apparent typos inside them are kept as-is.
negative = {

    ## PFAM | SPARCLE only
    ## https://www.ncbi.nlm.nih.gov/protfam/?term=%22TolC+family+protein%22
    "TolC family protein", # 186095 | seen: False | mtrE,mdtP,opcM,cmeD,adeH,hefA,mdtQ,adeK,mdsC,aheC,axyM,opmD,bepC,oprB,oprN,adeC,emhC,opmE,eefC,cmeC,ttgC

    ## https://www.ncbi.nlm.nih.gov/protfam/?term=%22aminoglycoside+phosphotransferase+family+protein%22
    "aminoglycoside phosphotransferase family protein", # 85307 | seen: True | aph(4)-I,aph(2'')-III,aph(6)-I,cph,vph,aph(2'')-I,aph(2'')-II,aph(7'')-I,aph(9)-I,aph(6)*

    ## https://www.ncbi.nlm.nih.gov/protfam/?term=%22CrcB+family+protein%22
    "CrcB family protein", # 23495 | seen: False | crcB

    ## https://www.ncbi.nlm.nih.gov/protfam/?term=%22D-alanyl-D-alanine+carboxypeptidase+family+protein%22
    "D-alanyl-D-alanine carboxypeptidase family protein", # 63377 | seen: False | vanXY

    ## https://www.ncbi.nlm.nih.gov/protfam/?term=%22UDP-glucose%2FGDP-mannose+dehydrogenase+family+protein%22
    "UDP-glucose/GDP-mannose dehydrogenase family protein", # 49001 | seen: False | ugd

    ## https://www.ncbi.nlm.nih.gov/protfam/?term=%22CatB-related+O-acetyltransferase%22
    "CatB-related O-acetyltransferase", # 21314 | seen: False | apmA,catB,vat(A),vat(D),vat(C),vat(F),vat(E),vat(H),catC,vat(B),vat(I)

    ## https://www.ncbi.nlm.nih.gov/protfam/?term=%22erythromycin+esterase+family+protein%22
    "erythromycin esterase family protein", # 18957 | seen: False | ere(B),ere(A)

    ## https://www.ncbi.nlm.nih.gov/protfam/?term=%22D-alanine--D-alanine+ligase+family+protein%22
    "D-alanine--D-alanine ligase family protein", # 18218 | seen: False | vanI,vanA,vanP,vanG,vanF,vanC,vanB,vanD,vanO,vanE,vanL,vanN,vanM

    ##  https://www.ncbi.nlm.nih.gov/protfam/?term=%22AAC(3)%20family%20N-acetyltransferase%22
    "AAC(3) family N-acetyltransferase", # 10096 | seen: False | aac(3)-IV

    
    ## unnamed
    "unnamed protein product",

    ## transporters
    "AbgT family transporter",
    "AbgT family antimetabolite efflux transporter",
    "EamA family transporter", # 152679
    "PACE efflux transporter",
    "SMR family transporter",
    "MDR family MFS transporter",
    "MMPL family transporter",
    "MMPL family RND transporter", # 18 | seen: False | mmpL5
    "RND family transporter", # 8571
    "MATE family efflux transporter",
    "AI-2E family transporter", # 166716 | seen: False | cxpE

    "MFS transporter",
    "ABC transporter ATP-binding protein/permease",
    "ABC transporter ATP-binding protein",
    "ABC transporter permease",
    "ABC transporter permease subunit",
    "ABC transporter transmembrane domain-containing protein",
    "ABC-F family ATP-binding cassette domain-containing protein",
    "ABC-F type ribosomal protection protein",
    "ABC-2 transporter permease",
    "export ABC transporter ATP-binding protein", # 9 | seen: False | lieA
    "multidrug efflux SMR transporter",
    "multidrug efflux RND transporter permease subunit",
    "multidrug efflux MFS transporter",
    "multidrug effflux MFS transporter",
    "multidrug ABC transporter permease/ATP-binding protein",
    "multidrug efflux ABC transporter ATP-binding protein", # 46 | seen: False | narA
    "efflux RND transporter periplasmic adaptor subunit",
    "efflux RND transporter permease subunit",
    "efflux transporter outer membrane subunit",
    "HlyD family efflux transporter periplasmic adaptor subunit",
    "HlyD family secretion protein",
    "DHA2 family efflux MFS transporter permease subunit",
    "membrane protein", # 30647
    "DoxX family membrane protein", # 19727
    "YbaL family putative K(+) efflux transporter", # 8371 | seen: False | rosB
    "tripartite tricarboxylate transporter TctB family protein", # 45426 | seen: False | bahA
    "antibiotic transporter", # 94 | seen: False | drrB
    "antibiotic ABC transporter permease", # 326 | seen: False | drrB
    "siderophore RND transporter MmpL4", # 132 | seen: False | mmpL5
    "RND transporter MmpL1", # 156 | seen: False | mmpL5
    "RND transporter MmpL2", # 154 | seen: False | mmpL5
    "RND transporter MmpL9", # 139 | seen: False | mmpL5

    
    ## resistance vs sensitive
    "dihydrofolate reductase",
    "type 3 dihydrofolate reductase",
    "dihydropteroate synthase",
    "phosphoenolpyruvate synthase",
    "isoleucine--tRNA ligase", # 83290
    
    ## binding
    "ATP-binding protein",
    "AMP-binding protein", # 314446
    "GTP-binding protein",
    "FAD-binding protein",
    "putative solute-binding protein", # 4157
    "biotin/lipoyl-binding protein", # 16970 | seen: False | mdtN,vmeU,vceA,emrA
    "elongation factor G-binding protein", # 674 | seen: False | fusD,fusF,fusC,fusB

    ## containing
    "UvrD-helicase domain-containing protein", # 118545
    "JAB domain-containing protein", # 12754
    "THUMP domain-containing protein", # 8006
    "DUF2304 domain-containing protein", # 7016
    "FBP domain-containing protein", # 6773
    "methyltransferase domain-containing protein", # 221147
    "alpha/beta hydrolase fold domain-containing protein", # 16899
    "ATP-binding cassette domain-containing protein",
    "nucleotidyltransferase domain-containing protein", # 82579 | seen: False | lnu(F),lnu(B),ant(4')-II,lnu(G)
    "penicillin-binding transpeptidase domain-containing protein", # 63879 | seen: False | blaBPU,blaBAT,blaRSD2,blaOXA,blaRSD1,blaBSU,blaLCR/NPS,blaCDD
    "GMC family oxidoreductase N-terminal domain-containing protein", # 47234 | seen: True | capO
    "PEP/pyruvate-binding domain-containing protein", # 19412 | seen: False | rphC,rphB,rphD
    "nucleotide disphospho-sugar-binding domain-containing protein", # 15778 | seen: False | mgt,rgt,ole(D)
    "aminoglycoside adenylyltransferase domain-containing protein", # 3661 | seen: False | ant(3'')-II,ant(9)*,ant(9)-I,ant(3'')-I
    "flippase-like domain-containing protein", # 2921 | seen: False | mprF
    "cysteine-rich KTR domain-containing protein", # 1916 | seen: False | cfr*,cfr(C)
    "UDP binding domain-containing protein", # 1407 | seen: False | ugd
    "phosphoethanolamine transferase domain-containing protein", # 289 | seen: False | mcr-7,mcr-10,mcr-3,mcr-9
    "kanamycin nucleotidyltransferase C-terminal domain-containing protein", # 259 | seen: False | ant(4')-I
    "oxytetracycline resistance phosphoribosyltransferase domain-containing protein Tet(34)", # 139 | seen: True | tet(34)
    "pentapeptide repeat-containing protein", # 111034 | seen: False | mfpA,qnrE,qnrB,qnr*
    "FG-GAP-like repeat-containing protein", # 22607 | seen: False | fusH


    ## family
    "GNAT family protein",
    "VOC family protein",
    "AAA family ATPase",
    "DoxX family protein",
    "zeta toxin family protein", # 14227
    "S41 family peptidase",
    "S1 family peptidase", # 21246
    "Yip1 family protein", # 13972
    "DUF2705 family protein", # 845
    "YrzE family protein", # 759
    "SLC13 family permease", # 91126 | seen: False | mtrF
    "isopentenyl phosphate kinase family protein", # 19 | seen: False | fomA


    ## hydrolase/reductase/monooxygenase
    "alpha/beta hydrolase", # 1208752
    "serine hydrolase", # 161930
    "MBL fold metallo-hydrolase", # 555439 | seen: False | blaEFM,blaNWM,blaECM,varG,blaCRD3,blaSIE,cphA,blaCAR,blaLMB,blaEVM,blaELM,blaTHIN,blaEAM,blaB3SU1
    "dienelactone hydrolase family protein", # 103239 | seen: False | albD

    "NAD(P)/FAD-dependent oxidoreductase", # 485079 | seen: False | tet(X)
    "NAD(P)-dependent oxidoreductase", # 253434 | seen: False | vanH
    "FAD-dependent oxidoreductase", # 630664 | seen: False | iri,rox
    "SDR family oxidoreductase",

    "FAD-dependent monooxygenase", # 133533 | seen: False | rox,tet(51),tet(47),iri,tet(54),tet(55),tet(X),tet(49),tet(50),tet(56),tet(48)


    ## *transferase
    "N-acetyltransferase", # 87720
    "GNAT family N-acetyltransferase",

    "class I SAM-dependent methyltransferase", # 892945
    "SAM-dependent methyltransferase", # 122975
    "methyltransferase",
    "RNA methyltransferase",
    "putative RNA methyltransferase", # 14391
    "rRNA methyltransferase",
    "23S rRNA methyltransferase",
    "16S rRNA (cytidine(1402)-2'-O)-methyltransferase",
    "TrmH family RNA methyltransferase", # 15558 | seen: False | aviRb,tsnR,nshR
    "50S ribosomal protein L11 methyltransferase", # 58778
    "rRNA adenine N-6-methyltransferase family protein", # 5013 | seen: False | erm(45),erm(40),erm(37),erm(33),erm(G),erm(55),erm(39),erm(O),erm(C),erm(T),erm(V),erm(B),erm(46),erm(35),erm(W),erm(49),erm(F),erm(Y),erm(54),erm(31),erm(41)

    "acetyltransferase", # 37542 | seen: False | aac(6')*,vat(I),vat(F),vat(E)
    "antibiotic acetyltransferase", # 109 | seen: False | vat(E)
    "phosphotransferase", # 135409 | seen: False | aph(4)-I,aph(2'')-III,mph(K),cph,aph(3'')-I,vph,mph(J),mph(I),aph(9)-I
    "glycosyltransferase", # 797725 | seen: False | mgt,rgt,ole(D),arnC
    "glycosyltransferase family 1 protein", # 97033 | seen: False | rgt
    "glycosyl transferase", # 23457 | seen: False | mgt
    "nucleotidyltransferase", # 28424 | seen: False | lnu(G),lnu(B)
    "nucleotidyltransferase family protein", # 93474 | seen: False | lnu(D),lnu(P),lnu(C)
    "xanthine phosphoribosyltransferase", # 18612 | seen: False | tet(34)
    "phosphoethanolamine transferase", # 17345 | seen: False | mcr-1,mcr-10,mcr-5,mcr-2,mcr-3,mcr-6,eptC,mcr-9,mcr-7,icr
    "glutathione transferase", # 5428 | seen: False | fosC,fosL,fos*,fosA
    "aminoacyltransferase", # 6102 | seen: False | vanK

    
    ## others
    "acyl-CoA dehydrogenase family protein", # 500171 | seen: False | sulX
    "aminotransferase class I/II-fold pyridoxal phosphate-dependent enzyme", # 197872 | seen: False | arnB
    "polysaccharide deacetylase family protein", # 192991 | seen: False | arnD
    "phosphatase PAP2 family protein", # 188346 | seen: False | lpxF,bcrC,lpxE
    "lysophospholipid acyltransferase family protein", # 172009 | seen: False | almG
    "endonuclease/exonuclease/phosphatase family protein", # 163544 | seen: False | vanJ
    "FtsX-like permease family protein", # 163053 | seen: False | psdB,bceB,yxdM,vraE
    "pyridoxamine 5'-phosphate oxidase family protein", # 153260 | seen: False | nimG,nimE,nimF,nimA,nimB,nimC,nimD,nimL,nimJ,nimH,nimI,nimK
    "radical SAM protein", # 136843 | seen: False | cfr(A),clbA,cfr(B),cfr(E),clbC
    "flavin reductase family protein", # 114712 | seen: False | sulR
    "alanine racemase", # 104361 | seen: False | vanT

    "DegT/DnrJ/EryC1/StrS family aminotransferase", # 80955 | seen: False | arnB
    "M15 family metallopeptidase", # 71616 | seen: False | vanX,vanXY
    "nucleotide exchange factor GrpE", # 56838 | seen: False | muxA
    "peptidoglycan bridge formation glycyltransferase FemA/FemB family protein", # 11558 | seen: False | vanK
    "peptidoglycan bridge formation glycyltransferase FemY", # 12 | seen: False | vanK
    "peptidoglycan bridge formation glycyltransferase FemX", # 491 | seen: False | vanK

    "phospho-N-acetylmuramoyl-pentapeptide-transferase", # 46189 | seen: False | mtrD,sdeY
    "nucleotide sugar dehydrogenase", # 60445 | seen: False | ugd
    "Na+/H+ antiporter NhaC family protein", # 49210 | seen: False | tet(35)

    "lipoprotein", # 29143 | seen: False | eefA,amrA,acrE,mexA,smeD,emhA,acrA,adeI,bpeA,sdeX,ttgA,aheA,axyA,smeA,mtrC
    "trypsin-like serine protease", # 27571 | seen: False | fusH
    "chemotaxis protein", # 24858 | seen: False | cpt
    "DapH/DapD/GlmU-related protein", # 24373 | seen: False | catB,catC
    "DegT/DnrJ/EryC1/StrS aminotransferase family protein", # 14264 | seen: False | arnB
    "monovalent cation:proton antiporter-2 (CPA2) family protein", # 20135 | seen: False | rosB
    "cation:proton antiporter", # 111507 | seen: False | rosB
    "N-6 DNA methylase", # 62569
    "D-isomer specific 2-hydroxyacid dehydrogenase family protein", # 6000 | seen: False | vanH
    "ATPase AAA", # 1156 | seen: False | helR
    "K5 polysaccharide biosynthesis UDP-glucose dehydrogenase KfiD", # 21 | seen: False | ugd
    "D-alanine--D-alanine ligase", # 58730 | seen: False | vanG

}
# Accepted headers: every distinct BLAST-hit header that is not on the exclusion list.
aset = set(dxx['header']).difference(negative)
print(len(aset))

In [None]:
from Bio import SeqIO
import glob
from tqdm.contrib.concurrent import process_map

def parse_file(file):
    """Return the records of one FASTA file whose normalised header is in `aset`.

    Records whose description contains ', partial' are skipped. The header is the
    description cut at the first ' >', with the leading sequence id, everything from
    the first ' [' and any 'MULTISPECIES: ' prefix removed.

    Parameters
    ----------
    file : str
        Path to a FASTA file.

    Returns
    -------
    list
        The matching records, in file order.
    """
    kept = []
    with open(file) as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            if ', partial' in record.description:
                continue
            # NOTE(review): rsplit(' [') has no maxsplit, so it cuts at the FIRST ' [';
            # kept as-is because the other cells normalise headers the same way.
            header = record.description.split(' >')[0].split(' ',1)[1].rsplit(' [')[0].split('MULTISPECIES: ')[-1]
            if header in aset:
                kept.append(record)
    return kept

# glob.glob returns paths in arbitrary order; sort them so `r` (and everything derived
# from its order: ids, the FASTA output, the Entrez batches) is the same on every run.
r = process_map(parse_file, sorted(glob.glob('refseq_protein.full.fa.split/*.fa')), max_workers=20)

In [None]:
# Flatten the per-file result lists once and reuse the flat list below.
# Note: this rebinds `records`, which earlier held the set of BLAST hit tuples.
records = [rec for batch in r for rec in batch]
ids = [rec.id for rec in records]

# Number of retained sequences per normalised header, most frequent first.
names = [
    rec.description.split(' ',1)[1].split(' >')[0].rsplit(' [')[0].split('MULTISPECIES: ')[-1]
    for rec in records
]
tmp = pd.DataFrame(names).groupby(0).size().sort_values(ascending=False)
header2num = tmp.to_dict()

with open('sarg_raw.fasta', 'w') as output_handle:
    SeqIO.write(records, output_handle, 'fasta')

In [None]:
# Print every header retained more than 100 times as a ready-to-paste set entry:
# "<header>", # <count> | seen: <header in nr> | <genes hitting this header>
frequent = tmp.loc[tmp > 100]
for i in frequent.index:
    print(f'"{i}", # {header2num.get(i)} | seen: {i in nr} | {header2gene.get(i)}')

In [None]:
%%bash
# Make sure the output directory for the per-batch remark_*.tsv files exists.
# The batch files carry a timestamp in their name, so files from earlier runs are not
# overwritten and are also picked up by the final remark/*.tsv glob; uncomment the
# rm line to start from a clean directory.
# rm -rf remark
mkdir -p remark

In [None]:
from Bio import Entrez
from tqdm import tqdm
import time
import glob


def _name_evidence(record):
    """Return (evidence accession, source identifier) for one GenBank record.

    Both values come from the 'Evidence-For-Name-Assignment' structured comment;
    each falls back to 'NA' when the comment block or the individual key is absent.
    """
    comment = record.annotations.get('structured_comment', {})
    assignment = comment.get('Evidence-For-Name-Assignment', {})
    return assignment.get('Evidence Accession', 'NA'), assignment.get('Source Identifier', 'NA')


# NOTE(review): NCBI asks E-utilities clients to identify themselves (Entrez.email, and
# optionally Entrez.api_key); neither is set in the visible cells -- confirm it is done elsewhere.
ok = []
# Split the ids into `ff` interleaved batches of at most ~10000 ids each.
ff = len(ids) // 10000 + 1
folds = [list(ids)[i::ff] for i in range(ff)]
for sp, xx in enumerate(tqdm(folds)):
    zz = []
    print(sp)
    handle = Entrez.efetch(db="protein", id=','.join(xx), rettype = 'gbwithparts', retmode = "text")
    try:
        for record in SeqIO.parse(handle, "genbank"):
            # Always (re)compute both values per record: previously `source` was only set when a
            # structured comment existed, so it raised NameError or leaked from the previous record.
            evidence, source = _name_evidence(record)

            header = record.description.rsplit(' [')[0].split('MULTISPECIES: ')[-1]
            gene = [x for x in record.features if x.type == 'gene']
            # first gene feature's name, 'NA' if there is no gene feature or it has no /gene qualifier
            g = gene[0].qualifiers.get('gene', ['NA'])[0] if gene else 'NA'

            taxonomy = ';'.join(record.annotations['taxonomy'])
            zz.append([record.id, header, taxonomy, record.annotations['organism'], evidence, g, source])
    finally:
        # release the connection even if parsing a record fails
        handle.close()

    # One timestamped TSV per batch; columns: id, header, taxonomy, organism, evidence, gene, source.
    t = time.strftime("%m-%d-%H-%M-%S", time.localtime())
    pd.DataFrame(zz).to_csv(f'remark/remark_{sp}_{t}.tsv', sep='\t', header=None, index=False)

# Merge every batch file found in remark/ (this includes files left over from earlier runs).
pd.concat(pd.read_table(x, header=None) for x in glob.glob('remark/*.tsv')).to_csv('remark.tsv', sep='\t', index=False, header=None)