# PubMed

## 1. About

- PubMed comprises more than 33 million citations for biomedical literature from MEDLINE, life science journals, and online books. Citations may include links to full text content from PubMed Central and publisher web sites.
- The PMC OA Web Service allows users to discover downloadable resources from the PMC Open Access Subset.

- `website`: https://pubmed.ncbi.nlm.nih.gov/
- `paper`: https://www.nature.com/articles/nbt.4267
- `OA Web Service`: https://www.ncbi.nlm.nih.gov/pmc/tools/oa-service/

## 2. PubMed

In [5]:
import json
import os
import time

from Bio import Medline, Entrez
# Contact address used by Bio.Entrez for its requests; it is left blank here.
# NOTE(review): presumably each user should fill in their own e-mail - confirm.
Entrez.email = ""

In [2]:
import os
# Send HTTP and HTTPS traffic from this process through one proxy by setting
# both the lower-case and upper-case spellings of the environment variables.
proxy = 'http://165.225.96.34:10015'
os.environ.update(
    dict.fromkeys(('http_proxy', 'HTTP_PROXY', 'https_proxy', 'HTTPS_PROXY'), proxy)
)

<li><b>input a list of terms, return a list of pmids</b></li>

In [3]:
def term2pmid(terms: list, save_path=None) -> list:
    """Search PubMed for the given terms and return the matching PMIDs.

    Each term is wrapped in parentheses and the terms are joined with ``OR``
    before being sent to the ``pubmed`` database through ``Entrez.esearch``.

    Args:
        terms: Query terms. An empty list returns ``[]`` immediately, without
            contacting PubMed or writing ``save_path``.
        save_path: Optional output file. When truthy, the PMIDs are written
            to it one per line.

    Returns:
        The PMIDs from the search result, sorted ascending by numeric value.
    """
    if not terms:
        # Joining nothing would yield the meaningless query "()".
        return []
    # Build the query string.
    query_terms = '(' + ') OR ('.join(terms) + ')'
    print('[query]: {}'.format(query_terms))
    # Run the search.
    # NOTE(review): the huge RetMax asks for every hit in one call; NCBI may
    # cap how many PubMed IDs a single ESearch returns - confirm completeness.
    handle0 = Entrez.esearch(db='pubmed', term=query_terms, RetMax=300000000)
    try:
        record = Entrez.read(handle0)
    finally:
        # Release the connection even when parsing fails.
        handle0.close()
    pmids = record['IdList']
    # Sort ascending by numeric value rather than lexicographically.
    pmids.sort(key=int)
    # Save the result.
    if save_path:
        with open(save_path, 'w') as f:
            f.write('\n'.join(pmids) + '\n')
        print('[pmid]: {}, [save path]: {}'.format(len(pmids), save_path))
    return pmids

# Example: collect the PMIDs for two COVID-related terms and preview the
# total together with the first five IDs.
terms = ['COVID-19', 'SARS-CoV-2']
pmids = term2pmid(terms)
print(f"{len(pmids)} {pmids[:5]}")

[query]: (COVID-19) OR (SARS-CoV-2)
193903 ['6783405', '14648488', '15631713', '23449231', '23517868']


<li><b>download abstracts in MEDLINE format</b></li>

In [6]:
def downloadMedline(pmids: list = None, save_path=None, batch_size: int = 10000):
    """Fetch MEDLINE records for PMIDs from PubMed and optionally save them as JSON.

    The PMIDs are requested in consecutive batches through ``Entrez.efetch``
    (``rettype='medline'``, ``retmode='text'``) and parsed with
    ``Medline.parse``. Progress and elapsed time are printed.

    Args:
        pmids: PMIDs to download. ``None`` is treated as an empty list.
        save_path: Optional output file. When truthy, all parsed records are
            written to it as one UTF-8 JSON array.
        batch_size: Maximum number of PMIDs sent per ``efetch`` request.

    Returns:
        The string ``"downloaded!"``; the records themselves are not returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')
    if pmids is None:
        pmids = []
    t1 = time.time()
    print("[pmid]: {}, [save path]:{}".format(len(pmids), save_path))
    count = len(pmids)
    # Download batch by batch.
    medlines = []
    for start in range(0, count, batch_size):
        end = min(start + batch_size, count)
        print('\t[Downloading]: {}-{}'.format(start+1, end))
        handle1 = Entrez.efetch(db='pubmed', id=pmids[start:end], rettype='medline', retmode='text')
        try:
            medlines.extend(Medline.parse(handle1))
        finally:
            # Release the connection even when parsing fails.
            handle1.close()
    # Save.
    if save_path:
        # ensure_ascii=False emits raw non-ASCII characters, so the encoding
        # is fixed to UTF-8 instead of depending on the platform default.
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(json.dumps(medlines, ensure_ascii=False, indent=4))
        print('\t[saved]:', save_path)
    t2 = time.time()
    print('\t[used time]: {} seconds.'.format(round(t2-t1, 4)))
    return "downloaded!"


# Example: search PubMed for hepatitis B terms, then fetch the MEDLINE record
# of every hit.
terms = ['HBV', 'Hepatitis B Virus']
pmids = term2pmid(terms)
# No save_path is passed, so the records are not written to disk.
downloadMedline(pmids)

[query]: (HBV) OR (Hepatitis B Virus)
[pmid]: 65715, [save path]:None
	[Downloading]: 1-10000
	[Downloading]: 10001-20000
	[Downloading]: 20001-30000
	[Downloading]: 30001-40000
	[Downloading]: 40001-50000
	[Downloading]: 50001-60000
	[Downloading]: 60001-65715
	[used time]: 658.1524 seconds.


'downloaded!'

<li><b>PMCID - PMID - Manuscript ID - DOI Converter</b></li>

`https://www.ncbi.nlm.nih.gov/pmc/pmctopmid/`

## 3. PMC

In [2]:
import os
import json
import tarfile
import requests
from bs4 import BeautifulSoup

<li><b>Get database information:</b></li>

In [3]:
# Query the OA service without parameters and print the raw response.
url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi"
# A timeout keeps the cell from hanging forever if the service stops responding.
r = requests.get(url, timeout=60)
print(r.text)

<OA><responseDate>2021-11-08 09:24:05</responseDate><request>https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi</request><repositoryName>PubMed Central Open Access FTP Repository</repositoryName><formats><format>tgz</format><format>pdf</format></formats><records><count>3933626</count><count format="tgz">3933623</count><count format="pdf">1020064</count><latest>2021-11-08 05:16:42</latest></records></OA>



<li><b>Get all the records updated on or after a specified date:</b></li>

In [4]:
# Request the records matching the `from` date filter; the response is kept
# in `r` and not printed.
url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?from=2021-11-08"
# A timeout keeps the cell from hanging forever if the service stops responding.
r = requests.get(url, timeout=60)

<li><b>Get a record by id:</b></li>

In [5]:
# Request the OA record of a single article by its PMCID and print the raw
# response.
url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id=PMC5334499"
# A timeout keeps the cell from hanging forever if the service stops responding.
r = requests.get(url, timeout=60)
print(r.text)

<OA><responseDate>2021-11-08 09:24:16</responseDate><request id="PMC5334499">https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id=PMC5334499</request><records returned-count="2" total-count="2"><record id="PMC5334499" citation="World J Radiol. 2017 Feb 28; 9(2):27-33" license="CC BY-NC" retracted="no"><link format="tgz" updated="2017-03-17 13:10:45" href="ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/8e/71/PMC5334499.tar.gz" /><link format="pdf" updated="2017-03-03 06:05:17" href="ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/8e/71/WJR-9-27.PMC5334499.pdf" /></record></records></OA>



<li><b>download tar with wget</b></li>

`!wget -c ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/27/f6/PMC6517830.tar.gz -O - | tar -xz`

<li><b>download tar with python</b></li>

In [7]:
# Look up the OA record of one article and turn the FTP links of its tarball
# and PDF into HTTP URLs.
PMCID = 'PMC5334499'
PMC_info_url = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi?id="+PMCID
r = requests.get(PMC_info_url, timeout=60)
# Stop on an HTTP error instead of parsing an error page.
r.raise_for_status()
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
r_text = BeautifulSoup(r.text, 'html.parser')
record = r_text.find('record')
if record is None:
    # Without a <record> element there are no links to read.
    raise LookupError('no OA record found for ' + PMCID)
links = record.find_all('link')
# An empty string means the record offers no link in that format.
PMC_tar_href, PMC_pdf_href = '', ''
for link in links:
    if link['format'] == 'tgz':
        PMC_tar_href = link['href']
    elif link['format'] == 'pdf':
        PMC_pdf_href = link['href']
# Swap the URL scheme so the files can be fetched with requests.
PMC_tar_url = PMC_tar_href.replace('ftp:', 'http:')
PMC_pdf_url = PMC_pdf_href.replace('ftp:', 'http:')
print(PMC_tar_url)
print(PMC_pdf_url)

http://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/8e/71/PMC5334499.tar.gz
http://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_pdf/8e/71/WJR-9-27.PMC5334499.pdf


In [None]:
# Download the article tarball and unpack it into the current directory.
tar_r = requests.get(PMC_tar_url, timeout=300)
# Fail here rather than saving an HTTP error page under a .tar.gz name.
tar_r.raise_for_status()
with open(PMCID+".tar.gz", 'wb') as f:
    f.write(tar_r.content)
# The context manager closes the archive even if extraction raises.
with tarfile.open(PMCID+'.tar.gz') as archive:
    if hasattr(tarfile, 'data_filter'):
        # The archive comes from the network, so apply the 'data' extraction
        # filter where this Python version provides it.
        archive.extractall('.', filter='data')
    else:
        # NOTE(review): no extraction filter on this interpreter, so member
        # paths are not sanitised - only extract archives you trust.
        archive.extractall('.')

<li><b>download pdf with python</b></li>

In [None]:
# Download the article PDF and save it next to the notebook.
pdf_r = requests.get(PMC_pdf_url, timeout=300)
# Fail here rather than saving an HTTP error page under a .pdf name.
pdf_r.raise_for_status()
with open(PMCID+".pdf", 'wb') as f:
    f.write(pdf_r.content)