# Merge 2020 and 2021 Results

In [1]:
import json
import os
import re
import sys
import tempfile
from pathlib import Path, PurePath
from pprint import pprint

import numpy as np
import pandas as pd
import requests
import requests_cache

In [2]:
# Load the IPython extension named `sql`.
# NOTE(review): presumably ipython-sql (provides the %sql / %%sql magics) — confirm.
%load_ext sql

In [3]:
# Install a requests cache named "pfocr_cache" so that repeated HTTP requests
# made later in this notebook can be answered from the cache.
requests_cache.install_cache("pfocr_cache")

## Import PFOCR 2020 Results

In [4]:
from functools import partial

import rpy2.robjects as ro
from rpy2.ipython import html
from rpy2.robjects import default_converter, pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.lib.dplyr import DataFrame
from rpy2.robjects.packages import importr

# Pre-bind table_class="docutils" on rpy2's HTML data-frame renderer.
# NOTE: the name is rebound to a partial of itself, so re-running this cell
# wraps the previous partial again (with the same keyword each time).
html.html_rdataframe = partial(html.html_rdataframe, table_class="docutils")

In [5]:
# Activate rpy2's pandas conversion and grab handles to the R objects this
# notebook uses.
pandas2ri.activate()
base = importr("base")  # R's `base` package
# R functions for reading / writing .rds files, looked up by name.
readRDS = ro.r["readRDS"]
saveRDS = ro.r["saveRDS"]



In [6]:
def rds2pandas(rds_path):
    """Read an RDS file with R's ``readRDS`` and convert the result for pandas.

    Parameters
    ----------
    rds_path : str or path-like
        Location of the ``.rds`` file; it is passed to R as ``str(rds_path)``.

    Returns
    -------
    The loaded R object after conversion with rpy2's default converter
    combined with the pandas converter (presumably a ``pandas.DataFrame``
    when the file holds an R data frame — depends on the file's content).
    """
    r_to_pandas = ro.default_converter + pandas2ri.converter
    loaded = readRDS(str(rds_path))
    # Convert inside a local converter context so the pandas rules apply.
    with localconverter(r_to_pandas):
        return ro.conversion.rpy2py(loaded)

In [7]:
def pandas2rds(pandas_df, rds_path):
    """Convert a pandas DataFrame to an R data frame and save it as an RDS file.

    Parameters
    ----------
    pandas_df : pandas.DataFrame
        The table to write.
    rds_path : str or path-like
        Destination of the ``.rds`` file; it is passed to R as ``str(rds_path)``.

    Returns
    -------
    None
        The function is called for its side effect of writing ``rds_path``.
    """
    # Build the R-side data frame while the pandas conversion rules are active.
    # The context manager's value is not needed, so it is not bound to a name.
    with localconverter(default_converter + pandas2ri.converter):
        r_df = DataFrame(pandas_df)

    saveRDS(r_df, str(rds_path))

# Get OA PMC data

Download the OA PMC data in XML format. Only run this during off-hours, because it sends one request per PMC ID.

[PMC OAI service documentation](https://www.ncbi.nlm.nih.gov/pmc/tools/oai/)

In [41]:
# Directory holding the 20210515 pathway-ocr data; "~" is expanded to the
# current user's home directory.
# NOTE(review): machine-specific Dropbox location — adjust when running elsewhere.
data_dir = Path(
    "~/Dropbox (Gladstone)/Documents/pathway-ocr/20210515/"
).expanduser()

In [None]:
# Load the figures table stored in `pfocr_figures_20210515.rds` under data_dir.
pfocr_figures_df = rds2pandas(data_dir / "pfocr_figures_20210515.rds")

In [None]:
import time

tool = "pfocr"
email = "anders.riutta@gladstone.ucsf.edu"

# Download one OAI GetRecord XML document per unique PMC ID. IDs whose XML file
# already exists are skipped, so the cell can be re-run to resume.
# NOTE(review): `images_dir` is not defined in the cells shown above (only
# `data_dir` is) — confirm it is set before running this cell.
for pmc_id in pfocr_figures_df["pmc_id"].unique():
    xml_path = images_dir.joinpath(f"{pmc_id}.xml")

    if xml_path.exists():
        continue

    # Drop the first three characters (the "PMC" prefix, as in "PMC8012676")
    # to get the part used in the OAI identifier.
    deprefixed_pmc_id = pmc_id[3:]

    pmc_url = "&".join(
        [
            "https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord",
            f"identifier=oai:pubmedcentral.nih.gov:{deprefixed_pmc_id}",
            "metadataPrefix=pmc",
            # TODO: why am I getting an error when I use any of the following?
            # NOTE(review): likely because OAI-PMH rejects arguments the
            # protocol does not define (badArgument); these look like
            # E-utilities parameters — confirm against the PMC OAI docs.
            # f"api_key={NCBI_API_KEY}",
            # f"tool={tool}",
            # f"email={requests.utils.quote(email)}",
        ]
    )

    # Write to a sibling ".part" file and rename only after the whole body has
    # been received. Otherwise a failed or interrupted download would leave an
    # empty/truncated .xml that the exists() check above skips forever.
    partial_path = xml_path.with_name(f"{xml_path.name}.part")
    try:
        with requests.get(pmc_url, stream=True, timeout=60) as r:
            # Do not save HTTP error pages as if they were records.
            r.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        partial_path.replace(xml_path)
    except requests.RequestException as error:
        # Best effort: report and move on; the ID is retried on the next run.
        print(f"Failed to download {pmc_id}: {error}", file=sys.stderr)
    finally:
        # No-op after a successful rename; removes leftovers otherwise.
        if partial_path.exists():
            partial_path.unlink()

    # Throttle to roughly three requests per second.
    time.sleep(0.333)

The file `PMC8012676.xml` is a good example of a case where a composite figure can be detected from the text. It also contains many good examples for studying the relationships between the text, the citations, and the figures.