<a href="https://colab.research.google.com/github/tnewbern/academic-colab/blob/colab/notebooks/efetch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SelfStudy Labs:Explorer Import — E-Utilities

## GitHub Login

In [0]:
%%shell
# Prompt for GitHub credentials. -s keeps the password off the screen, so the
# trailing echo supplies the newline that the silent prompt does not print.
read -rp "Username: " GH_USER
read -srp "Password: " GH_PASS; echo

# Only allow fast-forward pulls, and use the "store" credential helper.
git config --global pull.ff only
git config --global credential.helper store

# Hand the credentials for https://github.com to the configured helper using
# the key=value format that `git credential approve` reads from stdin.
printf 'protocol=https\nhost=github.com\nusername=%s\npassword=%s\n' \
  "$GH_USER" "$GH_PASS" | git credential approve

## E-Search

With E-Search, you can use any of the search terms, fields, and filters that are allowed when doing a search from the PMC home page. See [Searching PMC](https://www.ncbi.nlm.nih.gov/books/NBK3825/#_pmchelp_Searching_PMC_) in the [PMC Help Book](https://www.ncbi.nlm.nih.gov/books/NBK3825/) for more information.


In [0]:
#@title Retrieve PMC article identifiers (PMCIDs) from a search
#@markdown #### Search Term
term = "2019-nCoV OR 2019nCoV OR COVID-19 OR SARS-CoV-2 OR ((wuhan AND coronavirus) AND 2019/12[PDAT]:2030[PDAT]) AND open access[filter]" #@param {type:"string"}
#@markdown #### Max Number of Articles
retmax = 20 #@param {type:"number"}

from lxml import etree
import requests

# Query E-Search on the PMC database with the form values above.
# The timeout keeps the cell from hanging forever if the service stalls;
# without one, requests waits indefinitely.
esearch = requests.get(
    'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi',
    params=dict(
        db='pmc',
        term=term,
        usehistory='y',
        sort='relevance',
        retmax=retmax,
        retstart=0,
    ),
    timeout=60,
)
esearch.raise_for_status()
esearchresult = etree.fromstring(esearch.content)

# A top-level <ERROR> element aborts the cell; entries under <ErrorList> and
# <WarningList> are only reported.
error = esearchresult.find('ERROR')
if error is not None:
  raise ValueError(error.text)
for error in esearchresult.findall('ErrorList/*'):
  print('{}: {}'.format(error.tag, error.text))
for warning in esearchresult.findall('WarningList/*'):
  print('{}: {}'.format(warning.tag, warning.text))

# Prefix each returned id with "PMC" and save the list, one id per line.
uidlist = [f"PMC{uid.text}" for uid in esearchresult.findall('IdList/Id')]
with open('pmc_result.txt', 'w') as file:
  file.write("\n".join(uidlist))

In [0]:
# Show the PMCIDs collected by the search cell (also saved to pmc_result.txt).
uidlist

In [0]:
#@title Configure Repo { display-mode: "form" }
REPOSITORY = "" #@param {type:"string"}
SITEDIR = "explorer" #@param {type:"string"}
PUBDIR = "publication" #@param {type:"string"}
BRANCH = "import-colab" #@param {type:"string"}

if not REPOSITORY:
  raise ValueError("You must specify a site respository")
%env REPOSITORY $REPOSITORY
%env SITEDIR $SITEDIR
%env PUBDIR $PUBDIR
%env BRANCH $BRANCH

## Clone Site

In [0]:
%%shell
# Clone the site repository into $SITEDIR and switch to $BRANCH.
# Reads REPOSITORY, SITEDIR and BRANCH from the environment; -u makes a
# missing variable an error instead of an empty string.
set -u
echo "git clone --branch $BRANCH $REPOSITORY $SITEDIR"
read -rp "Username: " USERNAME
read -srp "Password: " PASSWORD; echo

git config --global pull.ff only
git config --global credential.helper store
# Register the credentials for the repository URL with the credential helper.
git credential approve <<EOF
url=$REPOSITORY
username=$USERNAME
password=$PASSWORD
EOF

# Reuse an existing clone so the cell can be re-run. Otherwise clone without
# checking out files, and stop if the clone fails so that the checkout below
# never runs in the wrong directory.
if [ ! -d "$SITEDIR/.git" ]; then
  git clone --no-checkout "$REPOSITORY" "$SITEDIR" || exit 1
fi
cd "$SITEDIR" || exit 1
# Use the branch if git can check it out; otherwise create it from HEAD.
git checkout "$BRANCH" 2>/dev/null || git checkout -b "$BRANCH"

## Create Citations

In [0]:
from lxml import etree
from pathlib import Path
import requests

esearch = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi', params=dict(
    db='pmc',
    term=(
        '2019-nCoV OR 2019nCoV OR COVID-19 OR SARS-CoV-2'
        ' OR ((wuhan AND coronavirus) AND 2019/12[PDAT]:2030[PDAT])'
        ' AND open access[filter]'
    ),
    usehistory='y',
    sort='relevance',
    retmax=100,
    retstart=0,
))
esearch.raise_for_status()
esearchresult = etree.fromstring(esearch.content)
error = esearchresult.find('ERROR')
if etree.iselement(error):
  raise ValueError(error.text)
for error in esearchresult.findall('ErrorList/'):
  print('{}: {}'.format(error.tag, error.text))
for warning in esearchresult.findall('WarningList/'):
  print('{}: {}'.format(warning.tag, warning.text))
uidlist = esearchresult.find('IdList')

efetch = requests.get('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi', params=dict(
    db='pmc',
    WebEnv=esearchresult.findtext('WebEnv'),
    query_key=esearchresult.findtext('QueryKey'),
    retmax=esearchresult.findtext('RetMax'),
    retstart=esearchresult.findtext('RetStart'),
))
efetch.raise_for_status()
efetchresult = etree.fromstring(efetch.content)
error = efetchresult.find('ERROR')
if etree.iselement(error):
  raise ValueError(error.text)
articleset = efetchresult

jats_conversion = Path('jats-conversion')
if not jats_conversion.exists():
  !git clone https://github.com/tnewbern/jats-conversion.git


jats_to_bibtex = etree.XSLT(etree.parse('jats-conversion/src/data/xsl/jats-to-bibtex.xsl'))
with open('citations.bib', 'wb') as file:
  for article in articleset:
    jats_to_bibtex(article).write_output(file)
    file.write('\n\n'.encode('utf-8'))
    file.flush()

## Academic Import

In [0]:
# Install (or upgrade to the latest release of) the `academic` package, whose
# command-line tool is invoked by the import cell below.
%pip install -U academic

In [0]:
%%shell
# Import citations.bib into the site's publication directory.
# The BibTeX path is resolved before changing directory so it stays valid
# even when SITEDIR is not a direct child of the current directory.
BIBFILE="$PWD/citations.bib"
# Stop rather than run the import from the wrong directory.
cd "$SITEDIR" || exit 1
academic import --bibtex "$BIBFILE" --overwrite --verbose --publication-dir "$PUBDIR"

## Push Changes

In [0]:
%%shell
# Show the state of the site checkout; quoting keeps a SITEDIR containing
# spaces or glob characters as a single argument.
git -C "$SITEDIR" status

In [0]:
%%shell
# Commit the imported publications and push them to origin/$BRANCH.
# Ask for an author e-mail only when git has none configured yet.
if [ -z "$(git config user.email)" ]; then
  read -rp "Email: " EMAIL
  git config --global user.email "$EMAIL"
fi
# Stop rather than stage, commit and push from the wrong directory.
cd "$SITEDIR" || exit 1
git add "content/$PUBDIR/"
git commit -m "Import Publication Content"
# Replay the new commit on top of any upstream changes before pushing.
git pull --rebase
git push -u origin "$BRANCH"