# fetch pathway figure PMCIDs from NCBI

NOTE: The query qualifier for figure captions, [CAPT], is clearly broken and only matches a fraction of caption titles.
The `imagesdocsum` report type does a better job of actually searching captions, e.g.:
- https://www.ncbi.nlm.nih.gov/pmc/?term=(signaling+pathway)+AND+(2019+[pdat])&report=imagesdocsum&dispmax=100 
(11349 hits with "signaling pathway" in every caption title or caption body)
- https://www.ncbi.nlm.nih.gov/pmc/?term=(signaling+pathway[CAPT])+AND+(2019+[pdat])&report=imagesdocsum&dispmax=100
(244 hits with "signaling pathway" ONLY in caption titles)
- https://www.ncbi.nlm.nih.gov/pmc/?term=(signaling+pathway[CAPT])+AND+(2019+[pdat])
(2775 hits when "report=imagesdocsum" is excluded)

NOTE: The `imagesdocsum` report is not supported by NCBI's eutils, so we have to fall back on HTML scraping.
How the PMC output is paginated is not apparent, however...

## Example queries for what is possible
- https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=asthma[mesh]+AND+leukotrienes[mesh]+AND+2009[pdat]&usehistory=y&retmax=500&retStart=0
- https://www.ncbi.nlm.nih.gov/pmc/?term=signaling+pathway+AND+2018+[pdat]&report=imagesdocsum&dispmax=100
- https://www.ncbi.nlm.nih.gov/pmc/?term=((((((((((signaling+pathway)+OR+regulatory+pathway)+OR+disease+pathway)+OR+drug+pathway)+OR+metabolic+pathway)+OR+biosynthetic+pathway)+OR+synthesis+pathway)+OR+cancer+pathway)+OR+response+pathway)+OR+cycle+pathway)+AND+(%222019/01/01%22[PUBDATE]+%3A+%223000%22[PUBDATE])&report=imagesdocsum&dispmax=100#

## Network query:
- https://www.ncbi.nlm.nih.gov/pmc/?term=((network)+OR+PPI)+AND+(%222019/01/01%22[PUBDATE]+%3A+%223000%22[PUBDATE])&report=imagesdocsum&dispmax=100

In [1]:
################## QUERY BUILDER

## Pathway types: one "+"-joined search phrase per entry
query.terms <- c(
  "signaling+pathway",
  "signalling+pathway",
  "regulatory+pathway",
  "disease+pathway",
  "drug+pathway",
  "metabolic+pathway",
  "biosynthetic+pathway",
  "synthesis+pathway",
  "cancer+pathway",
  "response+pathway",
  "cycle+pathway"
)

## Publication date window (YYYY/MM/DD)
query.date.from <- "2018/01/01"
query.date.to <- "3000/01/01"

## Left-nested OR chain: one "(" per term up front, and every term closes
## one of them, e.g. ((a)+OR+b) for two terms.
open.parens <- strrep("(", length(query.terms))
or.chain <- paste(paste0(query.terms, ")"), collapse = "+OR+")

## Date range restriction appended to the OR chain
date.clause <- paste0(
  "+AND+(\"", query.date.from, "\"[PUBDATE]+%3A+\"", query.date.to,
  "\"[PUBDATE])"
)

term <- paste0("term=", open.parens, or.chain, date.clause)

## Full PMC URL requesting the image summary report, 100 hits per page
query.url <- paste0(
  "https://www.ncbi.nlm.nih.gov/pmc/?", term, "&report=imagesdocsum",
  "&dispmax=100"
)
query.url

In [2]:
################ PMC SCRAPER
library(conflicted)
library(processx)
library(RSelenium)
library(rvest)
library(xml2)
library(tidyverse)
conflict_prefer("filter", "dplyr")
conflict_prefer("select", "dplyr")
conflict_prefer("mutate", "dplyr")

Loading required package: xml2

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.3     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.4     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

[conflicted] Will prefer [34mdplyr::filter[39m over any other package

[conflicted] Will prefer [34mdplyr::select[39m over any other package

[conflicted] Will prefer [34mdplyr::mutate[39m over any other package



In [3]:
# set dir for saving results as tsv
data_dir_base <- "../data/imagesdocsum_pathway_queries/"
# data_dir_suffix <- "20210429"
data_dir_suffix <- "20210513"
data_dir <- paste0(data_dir_base, data_dir_suffix)

if (basename(getwd()) != "notebooks") {
  # update this to whatever is appropriate for your system
  setwd("/home/ariutta/Documents/pathway-figure-ocr/notebooks")
}

# data_dir is relative to the notebooks directory, so make sure we are really
# there before creating or writing anything. Abort rather than just printing a
# message: otherwise query.txt and the TSV header below would be written into
# whatever directory happens to be current.
if (!("pfocr_fetch.R.ipynb" %in% list.files())) {
  stop(
    "wrong directory: pfocr_fetch.R.ipynb not found in ", getwd(),
    call. = FALSE
  )
}
if (!dir.exists(data_dir)) {
  dir.create(data_dir, showWarnings = TRUE, recursive = TRUE)
}
# All output files below are written relative to the data directory
setwd(data_dir)

getwd()

# Record the query that produced this data set
cat(query.url, file = "query.txt")

# Start (or overwrite) the TSV with a single header row; the scraping loop
# appends its rows to this same file.
write.table(data.frame(
  "figureid", "pmcid", "filename", "fignumber", "figtitle",
  "papertitle", "figcaption", "figlink", "reftext"
),
file = "pmc.df.all.tsv", append = FALSE,
sep = "\t", quote = FALSE, col.names = FALSE, row.names = FALSE
)

In [4]:
# adapted from https://www.tidyverse.org/blog/2018/09/processx-3.2.0/
#
# Launch `command` as a background process and wait until a line matching
# `message` shows up on its stdout or stderr.
#
# command: program to run.
# args:    character vector of command-line arguments.
# message: regular expression (passed to grepl) signalling readiness.
# timeout: seconds to wait for `message` before giving up.
# ...:     forwarded to process$new().
#
# Returns the process object once `message` has been seen. If the process
# exits or the timeout elapses first, the process is killed and an error
# "Cannot start <command>" is raised.
start_program <- function(command, args, message, timeout = 5, ...) {
  give_up_at <- Sys.time() + as.difftime(timeout, units = "secs")
  px <- process$new(command, args, stdout = "|", stderr = "|", ...)
  print(px)

  repeat {
    if (!px$is_alive()) {
      break
    }
    now <- Sys.time()
    if (!(now < give_up_at)) {
      break
    }

    # Block until there is output to read or the remaining time runs out
    remaining_ms <- as.double(give_up_at - now, units = "secs") * 1000
    px$poll_io(as.integer(remaining_ms))

    seen <- c(px$read_output_lines(), px$read_error_lines())
    if (any(grepl(message, seen))) {
      return(px)
    }
  }

  px$kill()
  stop("Cannot start ", command)
}

In [5]:
# TODO: don't start it if it's already running
# Launch the Selenium server on port 4445; start_program() returns once a
# line matching "running on port" has been seen in the server's output.
proc <- start_program(
  command = "selenium-server",
  args = c("-port", "4445"),
  message = "running on port"
)

PROCESS 'selenium-server', running, pid 22124.


In [6]:
# Firefox is started with the --headless argument
firefox_caps <- list(
  `moz:firefoxOptions` = list(args = list("--headless"))
)

# Client for the Selenium server on localhost:4445
remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  port = 4445L,
  browserName = "firefox",
  extraCapabilities = firefox_caps
)
remDr$open()

[1] "Connecting to remote server"
$acceptInsecureCerts
[1] FALSE

$browserName
[1] "firefox"

$browserVersion
[1] "68.8.0"

$`moz:accessibilityChecks`
[1] FALSE

$`moz:buildID`
[1] "20200613165304"

$`moz:geckodriverVersion`
[1] "0.26.0"

$`moz:headless`
[1] TRUE

$`moz:processID`
[1] 22174

$`moz:profile`
[1] "/run/user/1000/rust_mozprofile5mFuws"

$`moz:shutdownTimeout`
[1] 60000

$`moz:useNonSpecCompliantPointerOrigin`
[1] FALSE

$`moz:webdriverClick`
[1] TRUE

$pageLoadStrategy
[1] "normal"

$platformName
[1] "linux"

$platformVersion
[1] "4.19.116-hardened"

$rotatable
[1] FALSE

$setWindowRect
[1] TRUE

$strictFileInteractability
[1] FALSE

$timeouts
$timeouts$implicit
[1] 0

$timeouts$pageLoad
[1] 300000

$timeouts$script
[1] 30000


$unhandledPromptBehavior
[1] "dismiss and notify"

$webdriver.remote.sessionid
[1] "e040dafc-9472-4c2f-868d-6fa23bcd40f4"

$id
[1] "e040dafc-9472-4c2f-868d-6fa23bcd40f4"



In [7]:
## Go to the query result page (URL assembled in the query builder cell)
remDr$navigate(query.url)
# To confirm you got there, optionally take a screenshot:
# remDr$screenshot(display = TRUE)
# Show the title of the page the browser is now on
remDr$getTitle()

In [8]:
## Collect all pages!
# Empty accumulator: one character column per scraped field
df.all <- data.frame(
  figid = character(),
  pmcid = character(),
  filename = character(),
  number = character(),
  figtitle = character(),
  papertitle = character(),
  caption = character(),
  figlink = character(),
  reftext = character()
)

# Read the number of result pages from the pager of the current page
results.html <- xml2::read_html(remDr$getPageSource()[[1]])
pager.links <- results.html %>%
  rvest::html_nodes(".title_and_pager") %>%
  rvest::html_node(".pagination") %>%
  rvest::html_nodes("a")
pager.pages <- rvest::html_attr(pager.links, "page")
# The 4th pager link is used as the page total -- presumably the "last page"
# link; confirm against the live markup if PMC changes its layout.
page.count <- as.integer(pager.pages[4])
page.count

If the following cell gives an error, do the following:
1. set `restarting_where_left_off` to `TRUE`
2. set `start_page` to reflect the last page completed
3. run this cell and all below

In [11]:
# Set restarting_where_left_off to TRUE (and adjust start_page) to resume
# after a failure part-way through the result pages.
restarting_where_left_off <- FALSE
if (restarting_where_left_off) {
  remDr$goBack()
  start_page <- 402
} else {
  start_page <- 1
}
print(start_page)

# Fail early with a clear message if the page total could not be read, or if
# the requested start page lies beyond the last page.
stopifnot(!is.na(page.count), start_page <= page.count)

# Progress marker, overwritten after every page. Writing by path (instead of
# holding a file() connection for the whole loop) leaves no connection open
# if the loop stops with an error.
pages_completed_file <- file.path(getwd(), "pages_completed.txt")

for (i in start_page:page.count) {

  ## Parse page
  page.source <- xml2::read_html(remDr$getPageSource()[[1]])

  # One .rprt_img node per figure hit. The matching .rprt_cont node is looked
  # up through the shared parent, so every vector below has one entry per
  # figure, in the same order.
  img.nodes <- rvest::html_nodes(page.source, ".rprt_img")
  thumb.nodes <- rvest::html_node(img.nodes, "img")
  cont.nodes <- img.nodes %>%
    rvest::html_node(xpath = "..") %>%
    rvest::html_node(".rprt_cont")
  title.nodes <- rvest::html_node(cont.nodes, ".title")

  # Image file name: the part after "bin/" in the large-image URL
  filename <- thumb.nodes %>%
    rvest::html_attr("src-large") %>%
    str_match("bin/(.*\\.jpg)")
  filename <- as.character(filename[, 2])

  # Figure label, taken from the thumbnail's alt text
  number <- rvest::html_attr(thumb.nodes, "alt")

  # Title text has the form "<figure title> From: <paper title>"
  titles <- title.nodes %>%
    rvest::html_text() %>%
    str_split("\\s+From: ", simplify = TRUE)
  papertitle <- str_trim(titles[, 2])

  caption <- cont.nodes %>%
    rvest::html_node(".supp") %>%
    rvest::html_text()
  figlink <- rvest::html_attr(img.nodes, "image-link")
  reftext <- cont.nodes %>%
    rvest::html_node(".aux") %>%
    rvest::html_text() %>%
    str_remove(fixed("CitationFull text"))
  pmcid <- title.nodes %>%
    rvest::html_node("a") %>%
    rvest::html_attr("href") %>%
    str_extract("PMC\\d+")

  ## Extract best figure title from analysis of provided, number, title and caption
  ## (n = figure label, t = figure title, c = caption; steps run in order)
  temp.df <- data.frame(n = number, t = titles[, 1], c = caption, stringsAsFactors = FALSE) %>%
    mutate(
      # drop the figure label from the title
      t = str_trim(str_remove(
        t, fixed(
          as.character(
            if_else(
              n != "",
              n,
              "a string just to suppress the empty search patterns warning message"
            )
          )
        )
      )),
      # no caption: fall back to the title
      c = if_else(is.na(c), t, c),
      t = str_trim(str_remove(t, "\\.$")),
      # no title left: fall back to the caption
      t = if_else(t == "", c, t),
      # keep only the first sentence; str_remove() leaves strings without a
      # match unchanged, so no explicit match test is needed
      t = str_remove(t, "\\. .*"),
      t = str_trim(str_remove(t, "\\.+$")),
      t = str_trim(str_remove(t, "^\\.")),
      c = str_trim(str_replace(c, "\\.\\.", "\\.")),
      n = str_trim(str_replace(n, "\\.$", ""))
    )
  number <- as.character(temp.df$n)
  figtitle <- as.character(temp.df$t)
  caption <- as.character(temp.df$c)

  ## Prepare df and write to R.object and tsv
  df <- data.frame(
    pmcid, filename, number, figtitle, papertitle, caption, figlink,
    reftext
  ) %>%
    mutate(figid = paste(pmcid, filename, sep = "__")) %>%
    select(
      figid,
      pmcid, filename, number, figtitle, papertitle, caption, figlink, reftext
    )

  # Appended page by page so the rows collected so far remain available in
  # df.all (and in the TSV) if a later page fails.
  df.all <- rbind(df.all, df)

  write.table(df,
    file = "pmc.df.all.tsv", append = TRUE, sep = "\t", quote = TRUE,
    col.names = FALSE, row.names = FALSE, fileEncoding = "UTF-8"
  )

  # Move on to the next page unless this is the last one. The condition must
  # be `i < page.count`: with `i < page.count - 1` the browser never advanced
  # to the final page, so the second-to-last page was scraped twice and the
  # last page not at all.
  if (i < page.count) {
    next.page.button <- remDr$findElement(using = "xpath", "//*[@class='active page_link next']")
    next.page.button$clickElement()
    # remDr$screenshot(display = TRUE)
  }

  writeLines(paste(i, "of", page.count), pages_completed_file)
  # print(paste(i, "of", page.count))
}

In [12]:
## At the end of the day...
# Keep the first occurrence of every distinct row, then store the combined
# table as an RDS file in the current working directory.
is.repeat <- duplicated(df.all)
df.all <- df.all[!is.repeat, , drop = FALSE]
saveRDS(df.all, file = "pmc.df.all.rds")
# df.all <- readRDS('pmc.df.all.rds')

In [13]:
## Close up shop
# Close the browser sessions opened through the remote driver
remDr$closeall()

# Stop the selenium-server process; the is_alive() calls before and after
# show whether the kill took effect.
proc$is_alive()
proc$kill()
proc$is_alive()