# 获取初始的文章IDs
```key
wget https://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/edirect-install.sh
sh edirect-install.sh
export PATH=${PATH}:${HOME}/edirect
esearch -db pubmed -query "Envelope Proteins" OR " Envelope genetics" | efetch -format  uid
esearch -db pubmed -query '"Viral Fusion Proteins"[nm]' | efetch -format  uid > pubmed_id.txt
```

In [28]:
from Bio import Entrez
import itertools
import xml.etree.cElementTree as ET
import tqdm
import jsonlines
import traceback
Entrez.email = "johntr@gmail.com"


def parserArticleidlist(articleidlist):

    d = {}
    if not articleidlist:
        return d
    for x in articleidlist.findall(path="ArticleId"):
        d[x.get("IdType")] = x.text
    return d

def fetch(pubids):
    handler = Entrez.efetch(db="pubmed", id=",".join(pubids))
    record = ET.parse(handler)
#     xml_str = ET.tostring(record.getroot(), encoding="unicode", method="xml")
#     print(xml_str)
    ret = []
    for v in record.findall("PubmedArticle"):
        pubid = v.find("MedlineCitation/PMID").text
        article = v.find("MedlineCitation/Article")
        title = article.find("ArticleTitle").text
        node = article.find("Abstract/AbstractText")
        abstract = node.text if node is not None else ""
        node = v.find("PubmedData/ArticleIdList")
        doi = parserArticleidlist(node).get("doi")
        refers = v.find("PubmedData/ReferenceList")
        cites = []
        if refers:
            for r in refers.findall("Reference"):
                node = r.find("ArticleIdList")
                articleIdList = parserArticleidlist(node)
                citation = ""
                node = r.find("Citation")
                if node is not None: # may be False in leaf
                    citation = node.text
                cite = {"articleIdList":articleIdList, "citation":citation}
                cites.append(cite)
        content = {"pubid":pubid, "title":title, "abstract":abstract, "doi":doi, "cites":cites}
        ret.append(content)
    return ret


def fetchInfo(lst, trg):
    lst = list(set([v.strip() for v in lst if v.strip()]))
    bar = tqdm.tqdm(total=len(lst))
    with jsonlines.open(trg, "w") as wp:
        for i in range(0, len(lst), 100):
            if i<28200:
                bar.update(100)
                continue
            try:
                ret = fetch(lst[i:i+100])
                for content in ret:
                    wp.write(content)
            except:
                traceback.print_exc()
                pass
            bar.update(100)

# fetch(["37375092"])

In [29]:
def paperInfo(src, trg):
    lst = open(src).readlines()
    fetchInfo(lst, trg)


def getPaperReferIds(src, trg):
    origin, refers = set(), set()
    with jsonlines.open(src) as fp:
        for v in fp:
            origin.add(v["pubid"])
            for e in v["cites"]:
                if e["articleIdList"] and "pubmed" in e["articleIdList"]:
                    refers.add(e["articleIdList"]["pubmed"])
    refers = refers-origin
    fetchInfo(refers, trg)
    
tlp = "/Volumes/PortableSSD/projects/EnvelopProtein/llm_fusion/{}"
src = tlp.format("pubmed_id.txt")
papers_info = tlp.format("papers.json")
papers_refer = tlp.format("papers_refer1.json")
# paperInfo(src, papers_info)
getPaperReferIds(papers_info, papers_refer)

 75%|████████████████████████████████████████████████████████████████████████████████████▍                           | 28200/37405 [38:10<12:27, 12.31it/s]
37500it [11:27, 54.58it/s]                                                                                                                                 
