# PubTator
Send OCR data through PubTator to extract chemicals or diseases. In the `OCR` section below, you'll need to specify which concept type (`chemical` or `disease`) to extract.

In [1]:
import datetime
import json
import re
import sys
import time
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from unidecode import unidecode

In [2]:
from functools import partial

import rpy2.robjects as ro
from rpy2.ipython import html
from rpy2.robjects import default_converter, pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.lib.dplyr import DataFrame
from rpy2.robjects.packages import importr

html.html_rdataframe = partial(html.html_rdataframe, table_class="docutils")

pandas2ri.activate()
base = importr("base")
readRDS = ro.r["readRDS"]
saveRDS = ro.r["saveRDS"]


def rds2pandas(rds_path):
    """Read an RDS file and convert the resulting R object for use in Python.

    Args:
        rds_path: Location of the ``.rds`` file; it is passed to R's
            ``readRDS`` as a string.

    Returns:
        The object produced by rpy2's ``rpy2py`` conversion using the default
        converter combined with the pandas converter (presumably a pandas
        DataFrame when the RDS file holds an R data frame).
    """
    r_object = readRDS(str(rds_path))
    conversion_rules = ro.default_converter + pandas2ri.converter
    with localconverter(conversion_rules):
        return ro.conversion.rpy2py(r_object)


def pandas2rds(pandas_df, rds_path):
    """Convert a pandas DataFrame to an R data frame and save it as RDS.

    Args:
        pandas_df: The DataFrame to convert.
        rds_path: Destination of the ``.rds`` file; it is passed to R's
            ``saveRDS`` as a string.
    """
    conversion_rules = default_converter + pandas2ri.converter
    # The conversion to an R data frame must happen while the pandas-aware
    # converter is active.
    with localconverter(conversion_rules):
        r_df = DataFrame(pandas_df)

    saveRDS(r_df, str(rds_path))



## batch_dir
This is the directory that contains data from one run of the PFOCR pipeline, e.g., our 20200224 run. It is intended to contain data like AutoML, OCR, images, genes, etc.

Change the cell below to match the local path on your machine for the directory you want to work with.

In [3]:
# Directory holding the data for one PFOCR pipeline run. This is a
# machine-specific path: edit it to point at your local copy.
batch_dir = Path("/home/ariutta/Dropbox (Gladstone)/Documents/pfocr_pipeline/20200224")

## Load pfocr automl and ocr data

### figures
These figure data come from AutoML classification of images as `pathway` vs. `other` (with some corrections from manual curation in the case of the 20200224 data).

In [4]:
figures_rds_path = batch_dir / "pfocr_figures.rds"

# Column names as stored in the RDS file -> names used in this notebook.
figure_column_renames = {
    "figid": "pfocr_id",
    "pmcid": "pmc_id",
    "filename": "figure_filename",
    "number": "figure_number",
    "pmc_ranked_result_index": "pmc_search_index",
    "figtitle": "figure_title",
    "papertitle": "paper_title",
    "caption": "figure_caption",
    "figlink": "relative_figure_page_url",
    "reftext": "reference",
    "year": "publication_year",
}

pmc_articles_base_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/"

figures_df = rds2pandas(figures_rds_path).rename(columns=figure_column_renames)

# Build absolute URLs from the PMC id and the relative pieces in the data.
figures_df["paper_url"] = pmc_articles_base_url + figures_df["pmc_id"]
figures_df["figure_page_url"] = (
    "https://www.ncbi.nlm.nih.gov" + figures_df["relative_figure_page_url"]
)
figures_df["figure_thumbnail_url"] = (
    pmc_articles_base_url
    + figures_df["pmc_id"]
    + "/bin/"
    + figures_df["figure_filename"]
)

# These two columns were only needed to build the URLs above.
figures_df = figures_df.drop(
    columns=["figure_filename", "relative_figure_page_url"]
)

figures_df

Unnamed: 0,pfocr_id,pmc_id,publication_year,pathway_score,pmc_search_index,source_f,type.man,automl_index,reference,paper_title,figure_title,figure_number,figure_caption,organism,paper_url,figure_page_url,figure_thumbnail_url
1,PMC5653847__41598_2017_14124_Fig8_HTML.jpg,PMC5653847,2017,0.968270,133303,../data/images/PMC5653847__41598_2017_14124_Fi...,,3012,"Céline Barthelemy, et al. Sci Rep. 2017;7:13816.",FTY720-induced endocytosis of yeast and human ...,Model of FTY720-induced transporter endocytosi...,Figure 8,Model of FTY720-induced transporter endocytosi...,Homo sapiens,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
2,PMC4187043__zh20191474070013.jpg,PMC4187043,2014,0.965793,79929,../data/images/PMC4187043__zh20191474070013.jpg,,4323,"Yuan Wei, et al. Am J Physiol Renal Physiol. 2...",Angiotensin II type 2 receptor regulates ROMK-...,Stimulatory effect of ANG II on ROMK channel a...,Fig. 13,Proposed signaling pathway by which the stimul...,Homo sapiens,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
3,PMC5746550__rsob-7-170228-g1.jpg,PMC5746550,2017,0.962470,98034,../data/images/PMC5746550__rsob-7-170228-g1.jpg,,6334,"Georgia R. Frost, et al. Open Biol. 2017 Dec;7...",The role of astrocytes in amyloid production a...,AB production,Figure 1,Aβ production. In the amyloidogenic pathway (r...,Homo sapiens,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
4,PMC4211692__pone.0110875.g008.jpg,PMC4211692,2014,0.966721,142401,../data/images/PMC4211692__pone.0110875.g008.jpg,,3808,"Enida Gjoni, et al. PLoS One. 2014;9(10):e110875.",Glucolipotoxicity Impairs Ceramide Flow from t...,Glucolipotoxicity Impairs Ceramide Flow from t...,Figure 8,Glucolipotoxicity impairs CERT- and vesicular-...,Homo sapiens,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
5,PMC2588433__nihms78212f8.jpg,PMC2588433,,0.966758,67398,../data/images/PMC2588433__nihms78212f8.jpg,,3790,"Amanda L. Lewis, et al. J Biol Chem. ;282(38):...",NeuA sialic acid O-acetylesterase activity mod...,NeuA sialic acid O-acetylesterase activity mod...,Figure 8,Bacterial Sia biosynthesis can be divided into...,Homo sapiens,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64639,PMC4216988__zh20221474360006.jpg,PMC4216988,2014,0.143076,108774,../data/images/PMC4216988__zh20221474360006.jpg,pathway,77324,"Marcelo D. Carattino, et al. Am J Physiol Rena...",Prostasin interacts with the epithelial Na+ ch...,Hypothetical mechanism of activation of ENaC b...,Fig. 6,Hypothetical mechanism of activation of ENaC b...,Homo sapiens,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
64640,PMC2873070__nihms128887f5.jpg,PMC2873070,,0.127176,143547,../data/images/PMC2873070__nihms128887f5.jpg,pathway,78813,"Hua Cheng, et al. Neurobiol Aging. ;31(7):1188...",Apolipoprotein E mediates sulfatide depletion ...,A schematic diagram of a proposed working mode...,Scheme 1,A schematic diagram of a proposed working mode...,Homo sapiens,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...
64641,PMC3651446__pnas.1220523110fig06.jpg,PMC3651446,2013,0.055546,159643,../data/images/PMC3651446__pnas.1220523110fig0...,pathway,114977,"Jiun-Ming Wu, et al. Proc Natl Acad Sci U S A....",Aurora kinase inhibitors reveal mechanisms of ...,Models for nucleation of centrosomal and kinet...,Fig. 6,Models for nucleation of centrosomal and kinet...,Homo sapiens,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...
64642,PMC6770832__cancers-11-01236-g005.jpg,PMC6770832,2019,0.140041,618,../data/images/PMC6770832__cancers-11-01236-g0...,pathway,77606,"Carmel Mothersill, et al. Cancers (Basel). 201...",Relevance of Non-Targeted Effects for Radiothe...,Simplified TGFB pathway leading to p21 expression,Figure 5,A simplified TGFβ pathway leading to p21 expre...,Homo sapiens,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...


### OCR

In [5]:
# Command-line arguments, if any. The concept to extract may be passed as the
# first argument; otherwise the user is prompted for it.
args = sys.argv[1:]

supported_concepts = {"chemical", "disease"}

if (not args) or (args[0] not in supported_concepts):
    # Check that `args` is non-empty before indexing it: when there are no
    # arguments at all, args[0] would raise an IndexError.
    if args and args[0] == "-f":
        print("It appears this is running in a notebook.")

    print(f"Please specify a concept {list(supported_concepts)}:")
    # Strip surrounding whitespace so that input such as " chemical" is accepted.
    concept = input().strip()
else:
    concept = args[0]

if concept not in supported_concepts:
    raise Exception(
        f"""concept '{concept}' is not supported.
Please choose from supported concepts: {list(supported_concepts)}
"""
    )

# PubTator results for this concept are stored in their own subdirectory.
pubtator_response_dir = batch_dir.joinpath(f"pubtator_{concept}")
# exist_ok makes this safe to re-run without a separate exists() check.
pubtator_response_dir.mkdir(exist_ok=True)

pfocr_ids = set(figures_df["pfocr_id"])

# A figure counts as processed when an .ndjson file named after it already
# exists in the response directory.
processesed_pfocr_ids = set()
for p in pubtator_response_dir.glob("*.ndjson"):
    processesed_pfocr_ids.add(str(p.with_suffix(".jpg").name))

remaining_pfocr_ids = pfocr_ids - processesed_pfocr_ids

# Collect the OCR text for every figure that still needs to be processed.
ocr_text_by_pfocr_id = dict()
for p in batch_dir.joinpath("gcv_ocr").glob("*.json"):
    pfocr_id = str(p.with_suffix(".jpg").name)
    if pfocr_id not in remaining_pfocr_ids:
        continue

    with p.open("r") as f:
        gcv_ocr_data = json.load(f)

    if not gcv_ocr_data:
        print(f"No OCR data for {pfocr_id}")
        continue

    # The "description" of the first entry is used as the figure's OCR text.
    ocr_text = gcv_ocr_data[0]["description"]
    if ocr_text:
        ocr_text_by_pfocr_id[pfocr_id] = ocr_text

print(f"pfocr_ids to send to pubtator: {len(ocr_text_by_pfocr_id.keys())}")

It appears this is running in a notebook.
Please specify a concept ['disease', 'chemical']:


 chemical


No OCR data for PMC2777340__jbt0050900230005.jpg
No OCR data for PMC5059113__ganc-07-229-g002.jpg
No OCR data for PMC6797170__pone.0223840.g006.jpg
No OCR data for PMC3981025__OR-28-02-0567-g00.jpg
No OCR data for PMC5344263__biomedicines-04-00014-g001.jpg
No OCR data for PMC5799077__1349-7235-57-0153-g003.jpg
No OCR data for PMC3615581__zh70061338130002.jpg
No OCR data for PMC3472464__zh70191237630006.jpg
No OCR data for PMC3521217__1471-2164-13-S7-S25-4.jpg
No OCR data for PMC4255813__phy2-2-e12206-g4.jpg
No OCR data for PMC3928643__1475-2875-12-392-5.jpg
No OCR data for PMC3840597__1471-2407-13-549-4.jpg
No OCR data for PMC2698794__JOBOJOS91S40076fig4.jpg
No OCR data for PMC4531430__12967_2015_609_Fig4_HTML.jpg
No OCR data for PMC5385652__ajcr0007-0688-f4.jpg
No OCR data for PMC3024863__1471-2105-11-S11-S5-5.jpg
No OCR data for PMC2872667__pone.0010692.g002.jpg
No OCR data for PMC6727561__12885_2019_6052_Fig4_HTML.jpg
No OCR data for PMC3712543__fonc-03-00184-g006.jpg
No OCR data fo

## Submit PubTator Request(s)
If the process fails before finishing, you may need to re-run this cell. It is intended to process only the figures that haven't already been processed.

In [6]:
# Upper bound on the length (in characters) of one batched request body.
max_article_length = 200e3

# chr(31) is for the ASCII field separator character
# https://en.wikipedia.org/wiki/C0_and_C1_control_codes#Field_separators
separator = " " + chr(31) + " "

# Regex patterns are written as raw strings so that backslash escapes such as
# \( and \| reach the regex engine unchanged instead of being (invalid)
# Python string escapes.
open_paren_re = re.compile(r"\(")
close_paren_re = re.compile(r"\)")
# Matches the letters H, C, O, N, S, P (any case) and every non-letter
# character; a chunk that is empty after removing these is skipped.
side_metabolite_re = re.compile("[HCONSP]|[^a-z]", re.I)

# Title line of a PubTator record: "<id>|t|<text>"
title_re = re.compile(r"(^.+?)\|(t)\|(.*)")
# index_re = re.compile("(^.+?)(.+)")

# Error messages previously observed in PubTator responses, mapped to short
# codes that are recorded when a response line starts with the message.
error_message_codes_by_error_message = {
    "[Warning] : The Session number does not exist.": "nonexistent_session",
    '{"detail": "We have trouble processing your query"}': "trouble_processing",
}

# Abstract line of a PubTator record: "<id>|a|<text>"
abstract_re = re.compile(r"(.+?)\|(a)\|(.*)")
space_only_re = re.compile(r"^\s*$")

# Denotation line: at least five tab-separated fields; the fifth group
# captures everything after the fourth tab.
denotation_re = re.compile(r"(.+?)\t(.+?)\t(.+?)\t(.+?)\t(.+)")


def create_pubtator_session(request_body, bioconcept):
    """Submit one annotation request to the PubTator API.

    Args:
        request_body: The text to annotate; it is sent UTF-8 encoded.
        bioconcept: The concept type, appended to the submit URL.

    Returns:
        A dict with the key ``"status_code"`` and, only when the status code
        is 200, the key ``"session_number"`` holding the response text.
    """
    submit_url = f"https://www.ncbi.nlm.nih.gov/research/pubtator-api/annotations/annotate/submit/{bioconcept}"
    response = requests.post(submit_url, data=request_body.encode("utf-8"))

    status_code = response.status_code
    if status_code != 200:
        print("[Error]: HTTP code " + str(status_code))
        return {"status_code": status_code}

    # On success the response body is the session number.
    session_number = response.text
    print(
        "Thanks for your submission. The session number is: "
        + session_number
        + "\n"
    )
    return {"status_code": status_code, "session_number": session_number}


def retrieve_pubtator_session_results(session_number, iteration=0, delay=20):
    """Collect the response to a request made earlier to the PubTator API.

    Polls the retrieve endpoint until the response is something other than
    the "result is not ready" message, sleeping between attempts.

    Args:
        session_number: The session number returned when submitting.
        iteration: Count of polling attempts made so far; it is only
            tracked, not used to limit the number of attempts.
        delay: Seconds to sleep before the next attempt; multiplied by 1.5
            after every "not ready" response.

    Returns:
        The response text.

    Raises:
        Warning: If PubTator reports that it had trouble processing the query.
    """
    retrieve_url = f"https://www.ncbi.nlm.nih.gov/research/pubtator-api/annotations/annotate/retrieve/{session_number}"
    not_ready_text = "[Warning] : The Result is not ready.\n"
    trouble_text = '{"detail": "We have trouble processing your query"}'

    while True:
        res_text = requests.get(retrieve_url).text

        if res_text == not_ready_text:
            # Back off a little longer each time before asking again.
            time.sleep(delay)
            delay = delay * 1.5
            iteration += 1
            continue

        if res_text == trouble_text:
            raise Warning(
                f"Warning: PubTator had trouble processing query for {session_number}"
            )

        return res_text


sessions = []
# We send batched requests to PubTator, meaning we send multiple figures per
# request. We accumulate the figure ocr text onto the request body until the
# batch reaches the size limit or we run out of figures. If the batch reaches
# the size limit, we send that request and start a new one.
batched_request_body = ""

for pfocr_id, figure_ocr_text in ocr_text_by_pfocr_id.items():
    if not figure_ocr_text:
        continue

    # Split the OCR text into chunks on newlines, tabs, double spaces and
    # pipes, after transliterating it to ASCII with unidecode.
    text_chunks = []
    for text_chunk in unidecode(
        figure_ocr_text.replace("\n", separator)
        .replace("\t", separator)
        .replace("  ", separator)
        .replace("|", separator)
    ).split(separator):
        # Ignore words like this:
        # H
        # CH20
        # 721-2
        # --..>
        if len(side_metabolite_re.sub("", text_chunk)) == 0:
            continue

        open_paren_count = len(open_paren_re.findall(text_chunk))
        close_paren_count = len(close_paren_re.findall(text_chunk))
        if open_paren_count > close_paren_count:
            # if the text_chunk is "(Pyruvate", we need to balance the parens, because
            # the PubTator API won't return anything if parens are unbalanced.
            text_chunk += (
                " " + ")" * (open_paren_count - close_paren_count) + " "
            )
            # TODO: is there any way we could get a false positive by adding close parens?
            # should we add an extra open paren first to avoid false positives?
            # text_chunk += (" (" + ")" * (open_paren_count - close_paren_count + 1) + " ")
        text_chunks.append(text_chunk)

    text = separator.join(text_chunks)

    # Note: the extra empty line at the bottom is required
    current_figure_request_body = f"""{pfocr_id}|t|{text}
{pfocr_id}|a|

"""

    # Flush the current batch when adding this figure would exceed the limit.
    # The batch must be non-empty: otherwise a single figure that exceeds the
    # limit on its own would cause an empty request to be submitted.
    if batched_request_body and (
        len(batched_request_body) + len(current_figure_request_body)
        > max_article_length
    ):
        sessions.append(
            create_pubtator_session(batched_request_body, concept)
        )
        batched_request_body = ""

    batched_request_body += current_figure_request_body

# if the batched_request_body never reached max_article_length or
# if the final figure didn't fit into the last session,
# we need to process it here
if batched_request_body:
    sessions.append(create_pubtator_session(batched_request_body, concept))
    batched_request_body = ""

# Estimating total time required until results available
# (copied from PubTator docs)
initial_delay = len(sessions) * 200 + 250
preprocessing_time = 200
processing_time = max_article_length / 800
estimated_total_time = initial_delay + preprocessing_time + processing_time
ready = datetime.datetime.now() + datetime.timedelta(
    seconds=estimated_total_time
)

# Never wait longer than two hours, whatever the estimate says.
max_wait_seconds = 60 * 60 * 2
wait_seconds = min(
    (ready - datetime.datetime.now()).total_seconds(), max_wait_seconds
)

if sessions:
    print(
        f"Estimated total time to complete: {str(int(estimated_total_time))} seconds"
    )
    print(f"Estimated time when complete: {ready.strftime('%c')}")
    if wait_seconds > 0:
        time.sleep(wait_seconds)
else:
    # Nothing was submitted, so there are no results to wait for; sleeping
    # for the estimated time here would only delay the rest of the notebook.
    print("No PubTator sessions were submitted; nothing to wait for.")

# Maps each figure id seen in a title line to its list of denotations.
denotations_by_pfocr_id = {}
# Codes of the known error messages encountered while parsing responses.
errors = []

for session in sessions:
    session_number = session.get("session_number")
    if not session_number:
        # create_pubtator_session only sets "session_number" when the
        # submission returned HTTP 200, so a failed submission has nothing
        # to retrieve. Skip it instead of failing with a KeyError.
        print(
            f"Skipping session without a session number (HTTP status {session.get('status_code')})"
        )
        errors.append("submission_failed")
        continue

    pubtator_response = retrieve_pubtator_session_results(session_number)

    # Keep the raw response on disk, named after the session.
    with open(
        pubtator_response_dir.joinpath(f"{session_number}.PubTator"),
        "w",
    ) as f:
        f.write(pubtator_response)

    pfocr_id = None
    denotations = []

    lines = pubtator_response.splitlines()
    for i, line in enumerate(lines):
        # The previous, current and next line, shown in messages for debugging.
        context = "\n".join(lines[max(0, i - 1) : i + 2])
        # We determine whether this line of the response contains an error
        # message by running through all the error messages we've previously
        # observed. If the line starts with one of them, we record its code,
        # excise the message from the line and keep going.
        for (
            error_message,
            error_message_code,
        ) in error_message_codes_by_error_message.items():
            if line.startswith(error_message):
                errors.append(error_message_code)
                line = line[len(error_message) :]

        # an empty line indicates the end of lines for current figure
        if space_only_re.match(line):
            # Report a figure only when it was actually being processed and
            # ended up without any denotations.
            if pfocr_id is not None and len(denotations) == 0:
                print(f"no hits for {pfocr_id}\n")

            # reset
            pfocr_id = None
            denotations = []

            continue

        title_match = title_re.match(line)
        if title_match:
            if pfocr_id or len(denotations) > 0:
                raise Exception(
                    f"""pfocr_id and/or denotations weren't reset before title line!
Was there no empty line before the title line?
{context}
"""
                )

            pfocr_id = title_match.group(1)
            # Register the (still empty) list now; denotation lines that
            # follow append to this same list.
            denotations_by_pfocr_id[pfocr_id] = denotations

            continue

        abstract_match = abstract_re.match(line)
        if abstract_match:
            source_id = abstract_match.group(1)
            if len(denotations) > 0:
                raise Exception(
                    f"""denotations weren't reset before abstract line!
Was there no title line before the abstract line?
{context}
"""
                )
            elif (not source_id) or (source_id != pfocr_id):
                # Report the mismatch and skip this figure rather than
                # aborting the whole run.
                print(
                    f"""source_id {source_id} != pfocr_id {pfocr_id}!
Was there no title line before the abstract line?
{context}
"""
                )

                # reset
                pfocr_id = None
                denotations = []

                continue

            continue

        denotation_match = denotation_re.match(line)
        if denotation_match:
            source_id = denotation_match.group(1)
            if (not pfocr_id) or (source_id != pfocr_id):
                raise Exception(
                    f"""source_id {source_id} should match pfocr_id {pfocr_id}!
Was the expected title line or abstract line missing?
{context}
"""
                )

            word = denotation_match.group(4)
            obj = denotation_match.group(5)

            denotation = {"word": word}
            denotations.append(denotation)

            # The remainder is expected to be "type<TAB>identifier".
            word_type, obj_separator, word_identifier = obj.partition("\t")

            if word_type:
                denotation["type"] = word_type
            else:
                print(
                    f"{pfocr_id} missing type. expected 'type\\tidentifier' but got '{obj}'"
                )

            if word_identifier:
                denotation["identifier"] = word_identifier
            else:
                print(
                    f"{pfocr_id} missing identifier. expected 'type\\tidentifier' but got '{obj}'"
                )

            continue

        raise Exception(
            f"""Error: Unknown line type.
{context}
"""
        )

# Write one newline-delimited JSON file per figure, named after the figure id
# with its suffix replaced by ".ndjson". A figure without denotations gets an
# empty file.
for pfocr_id, denotations in denotations_by_pfocr_id.items():
    ndjson_path = pubtator_response_dir.joinpath(pfocr_id).with_suffix(
        ".ndjson"
    )
    with open(ndjson_path, "w") as f:
        f.writelines(
            json.dumps(denotation) + "\n" for denotation in denotations
        )

Thanks for your submission. The session number is: 6594-3037-2730-3034

Estimated total time to complete: 900 seconds
Estimated time when complete: Thu 28 Apr 2022 04:36:17 PM 
source_id PMC4980668__srep31509-f8.jpg != pfocr_id None!
Was there no title line before the abstract line?
PMC4980668__srep31509-f8.jpg|a|


source_id PMC5497948__pone.0178574.g008.jpg != pfocr_id None!
Was there no title line before the abstract line?

PMC5497948__pone.0178574.g008.jpg|a|


source_id PMC4367964__nihms669495f5.jpg != pfocr_id None!
Was there no title line before the abstract line?

PMC4367964__nihms669495f5.jpg|a|


source_id PMC5853609__erx12403.jpg != pfocr_id None!
Was there no title line before the abstract line?

PMC5853609__erx12403.jpg|a|




In [7]:
# Gather the denotations from every .ndjson file that belongs to a figure in
# the current figures data, tagging each record with its figure id.
pubtator_records = []
for p in pubtator_response_dir.glob("*.ndjson"):
    pfocr_id = str(p.with_suffix(".jpg").name)
    if pfocr_id not in pfocr_ids:
        continue

    with p.open("r") as f:
        for line in f:
            # Skip blank lines.
            if not line.strip():
                continue
            denotation = json.loads(line)
            denotation["pfocr_id"] = pfocr_id
            pubtator_records.append(denotation)

pubtator_df = pd.DataFrame.from_records(pubtator_records)
pandas2rds(pubtator_df, batch_dir.joinpath(f"pfocr_{concept}s.rds"))
pubtator_df

Unnamed: 0,word,type,identifier,pfocr_id
0,MDP CO,Chemical,MESH:C041058,PMC3624162__emss-52733-f0003.jpg
1,glucose farnesol,Chemical,MESH:D005204,PMC3624162__emss-52733-f0003.jpg
2,hydroxyurea A,Chemical,MESH:D006918,PMC3624162__emss-52733-f0003.jpg
3,4-coumaroyl,Chemical,MESH:C058644,PMC3229458__1471-2229-11-155-1.jpg
4,L-phenylalanine,Chemical,MESH:C119108,PMC3229458__1471-2229-11-155-1.jpg
...,...,...,...,...
238245,acetolactate,Chemical,MESH:C006359,PMC6267845__12934_2018_1038_Fig1_HTML.jpg
238246,succinate,Chemical,MESH:D013386,PMC6267845__12934_2018_1038_Fig1_HTML.jpg
238247,acetate,Chemical,MESH:D000085,PMC6267845__12934_2018_1038_Fig1_HTML.jpg
238248,acetoin,Chemical,MESH:D000093,PMC6267845__12934_2018_1038_Fig1_HTML.jpg


## Summary Stats

In [8]:
print(f"summary stats for {concept}")

unique_id_count = len(pubtator_df[["identifier"]].drop_duplicates())
print(f"unique IDs: {unique_id_count}")

unique_word_count = len(pubtator_df[["word"]].drop_duplicates())
print(f"unique words: {unique_word_count}")

# includes cases where the same ID shows up multiple times in one figure:
id_instances = pubtator_df[["pfocr_id", "identifier"]]
print(f"total ID instances: {len(id_instances)}")

# w/out multiple of the same in a figure:
deduplicated_id_instance_count = len(id_instances.drop_duplicates())
print(
    f"total ID instances (dedup w/in fig): {deduplicated_id_instance_count}"
)

word_instance_count = len(pubtator_df[["pfocr_id", "word"]])
print(f"total word instances: {word_instance_count}")

summary stats for chemical
unique IDs: 12785
unique words: 57126
total ID instances: 238250
total ID instances (dedup w/in fig): 207685
total word instances: 238250
