In [2]:
import json
from io import BytesIO
from pathlib import Path
from zipfile import ZipFile

import numpy as np
import pandas as pd
import requests
import requests_cache

In [3]:
requests_cache.install_cache("pfocr_cache")

In [4]:
from functools import partial

import rpy2.robjects as ro
from rpy2.ipython import html
from rpy2.robjects import default_converter, pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.lib.dplyr import DataFrame
from rpy2.robjects.packages import importr

html.html_rdataframe = partial(html.html_rdataframe, table_class="docutils")

In [5]:
pandas2ri.activate()
base = importr("base")
readRDS = ro.r["readRDS"]
saveRDS = ro.r["saveRDS"]

In [6]:
def rds2pandas(rds_path):
    r_df = readRDS(str(rds_path))
    with localconverter(ro.default_converter + pandas2ri.converter):
        pandas_df = ro.conversion.rpy2py(r_df)
    return pandas_df

In [7]:
def pandas2rds(pandas_df, rds_path):
    with localconverter(default_converter + pandas2ri.converter) as cv:
        r_df = DataFrame(pandas_df)

    saveRDS(r_df, str(rds_path))

In [8]:
current_dir = Path("./")

In [9]:
latest_pfocr_data_url = (
    "https://www.dropbox.com/sh/nmm0pddamgmu32m/AACCNdFXJrpQRD-EZOPe73RCa?dl=1"
)

zipfile = ZipFile(BytesIO(requests.get(latest_pfocr_data_url).content))
figures_rds_path = current_dir.joinpath("pfocr_figures_draft.rds")
zipfile.extract(str(figures_rds_path))
genes_rds_path = current_dir.joinpath("pfocr_genes_draft.rds")
zipfile.extract(str(genes_rds_path))
chemicals_rds_path = current_dir.joinpath("pfocr_chemicals_draft.rds")
zipfile.extract(str(chemicals_rds_path))
diseases_rds_path = current_dir.joinpath("pfocr_diseases_draft.rds")
zipfile.extract(str(diseases_rds_path))

'/content/pfocr_genes_draft.rds'

In [10]:
figures_df = rds2pandas(figures_rds_path).rename(
    columns={
        "figid": "pfocr_id",
        "pmcid": "pmc_id",
        "filename": "figure_filename",
        "number": "figure_number",
        "pmc_ranked_result_index": "pmc_search_index",
        "figtitle": "figure_title",
        "papertitle": "paper_title",
        "caption": "figure_caption",
        "figlink": "relative_figure_page_url",
        "reftext": "reference",
        "year": "publication_year",
    }
)
figures_df["paper_url"] = (
    "https://www.ncbi.nlm.nih.gov/pmc/articles/" + figures_df["pmc_id"]
)

figures_df["figure_page_url"] = (
    "https://www.ncbi.nlm.nih.gov" + figures_df["relative_figure_page_url"]
)

figures_df["figure_thumbnail_url"] = (
    "https://www.ncbi.nlm.nih.gov/pmc/articles/"
    + figures_df["pmc_id"]
    + "/bin/"
    + figures_df["figure_filename"]
)

figures_df.drop(
    columns=[
        "figure_filename",
        "relative_figure_page_url",
    ],
    inplace=True,
)

figures_df

Unnamed: 0,pfocr_id,figure_number,figure_caption,pmc_id,paper_title,reference,pmc_search_index,pathway_score,figure_title,organism,publication_year,paper_url,figure_page_url,figure_thumbnail_url
1,PMC5732092__cshperspect-CYT-028522_F2.jpg,Figure 2,Interleukin (IL)-17RA/RC signaling pathways. I...,PMC5732092,Interleukin 17 Family Cytokines: Signaling Mec...,"Leticia Monin, et al. Cold Spring Harb Perspec...",33056,0.900423,Interleukin (IL)-17RA/RC signaling pathways,Homo sapiens,2018,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,https://www.ncbi.nlm.nih.govpmc/articles/PMC57...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
2,PMC5793760__cshperspect-TGF-022210_F4.jpg,Figure 4,TGF-β signaling pathways. TGF-βs signal by bin...,PMC5793760,TGF-β Signaling in Control of Cardiovascular F...,"Marie-José Goumans, et al. Cold Spring Harb Pe...",54906,0.932830,TGFB signaling pathways,Homo sapiens,2018,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,https://www.ncbi.nlm.nih.govpmc/articles/PMC57...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
3,PMC5793761__cshperspect-TGF-031989_F1.jpg,Figure 1,Bone morphogenetic protein (BMP) signaling pat...,PMC5793761,Bone Morphogenetic Proteins in Vascular Homeos...,"Marie-José Goumans, et al. Cold Spring Harb Pe...",13758,0.966398,Bone morphogenetic protein (BMP) signaling pat...,Homo sapiens,2018,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,https://www.ncbi.nlm.nih.govpmc/articles/PMC57...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
4,PMC5830892__cshperspect-CEL-027961_F2.jpg,Figure 2,FERM-binding partners of Crumbs3. Through its ...,PMC5830892,The Crumbs3 Polarity Protein.,Ben Margolis. Cold Spring Harb Perspect Biol. ...,38670,0.693111,FERM-binding partners of Crumbs3,Homo sapiens,2018,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,https://www.ncbi.nlm.nih.govpmc/articles/PMC58...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
5,PMC5830900__cshperspect-TGF-031997_F1.jpg,Figure 1,Role of the TGF-β family in mammary gland deve...,PMC5830900,TGF-β Family Signaling in Ductal Differentiati...,"Kaoru Kahata, et al. Cold Spring Harb Perspect...",19123,0.600142,Role of the TGFB family in mammary gland devel...,Homo sapiens,2018,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,https://www.ncbi.nlm.nih.govpmc/articles/PMC58...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79945,PMC4216988__zh20221474360006.jpg,Fig. 6,Hypothetical mechanism of activation of ENaC b...,PMC4216988,Prostasin interacts with the epithelial Na+ ch...,"Marcelo D. Carattino, et al. Am J Physiol Rena...",108774,0.143076,Hypothetical mechanism of activation of ENaC b...,Homo sapiens,2014,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
79946,PMC2873070__nihms128887f5.jpg,Scheme 1,A schematic diagram of a proposed working mode...,PMC2873070,Apolipoprotein E mediates sulfatide depletion ...,"Hua Cheng, et al. Neurobiol Aging. ;31(7):1188...",143547,0.127176,A schematic diagram of a proposed working mode...,Homo sapiens,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...
79947,PMC3651446__pnas.1220523110fig06.jpg,Fig. 6,Models for nucleation of centrosomal and kinet...,PMC3651446,Aurora kinase inhibitors reveal mechanisms of ...,"Jiun-Ming Wu, et al. Proc Natl Acad Sci U S A....",159643,0.055546,Models for nucleation of centrosomal and kinet...,Homo sapiens,2013,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...
79948,PMC6770832__cancers-11-01236-g005.jpg,Figure 5,A simplified TGFβ pathway leading to p21 expre...,PMC6770832,Relevance of Non-Targeted Effects for Radiothe...,"Carmel Mothersill, et al. Cancers (Basel). 201...",618,0.140041,Simplified TGFB pathway leading to p21 expression,Homo sapiens,2019,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...


In [None]:
chemicals_df = rds2pandas(chemicals_rds_path)
chemicals_by_pfocr_id = dict()
for pfocr_id, df in chemicals_df[["pfocr_id", "identifier"]].groupby(
    "pfocr_id"
):
    # TODO: could any of these be something other than MESH IDs?
    chemicals_by_pfocr_id[pfocr_id] = set(df["identifier"].dropna())

In [None]:
diseases_df = rds2pandas(diseases_rds_path)
diseases_by_pfocr_id = dict()
for pfocr_id, df in diseases_df[["pfocr_id", "identifier"]].groupby("pfocr_id"):
    # TODO: could any of these be something other than MESH IDs?
    diseases_by_pfocr_id[pfocr_id] = set(df["identifier"].dropna())

In [12]:
genes_df = rds2pandas(genes_rds_path).rename(
    columns={
        "figid": "pfocr_id",
        "pmcid": "pmc_id",
        "entrez": "ncbigene_id",
        "word": "matched_ocr_text",
        "symbol": "lexicon_term",
        "source": "lexicon_term_source",
    }
)
genes_by_pfocr_id = dict()
for pfocr_id, df in genes_df[["pfocr_id", "ncbigene_id"]].groupby("pfocr_id"):
    genes_by_pfocr_id[pfocr_id] = set(df["ncbigene_id"].dropna())

## Export

https://github.com/wikipathways/pathway-figure-ocr/issues/16#issuecomment-1075574192
```json
{
  "_id": "PMC0000000__nihm00000000",
  "associatedWith": {
    "title": "signaling in immune response",
    "figureUrl": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC0000000/bin/nihm00000000.jpg",
    "pmc": "PMC0000000",
    "pubmed": "00000000",
    "mentions": {
        "diseases": {
              "mesh": ["D000860", "D000333"]
        },
        "chemicals": {
               "mesh": ["D0001", "D00002"]
        },
        "genes": {
               "ncbigene": ["1234", "2345"]
        }
     }
}
```

In [18]:
# https://stackoverflow.com/a/8230505/5354298
class SetEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, set):
            return list(obj)
        return json.JSONEncoder.default(self, obj)


export_dir = current_dir.joinpath("export")
if not export_dir.exists():
    export_dir.mkdir()

with open(export_dir.joinpath("bte_chemicals_diseases_genes.ndjson"), "w") as f:
    for i, row in figures_df.iterrows():
        pfocr_id = row["pfocr_id"]

        chemicals = chemicals_by_pfocr_id.get(pfocr_id, set())
        diseases = diseases_by_pfocr_id.get(pfocr_id, set())
        genes = genes_by_pfocr_id.get(pfocr_id, set())

        if chemicals or diseases or genes:
            f.write(
                json.dumps(
                    {
                        "_id": pfocr_id,
                        "associatedWith": {
                            "title": row["figure_title"],
                            "figureUrl": row["figure_thumbnail_url"],
                            "pmc": row["pmc_id"],
                            # "pubmed": "",
                            "mentions": {
                                "chemicals": {"mesh": chemicals},
                                "diseases": {"mesh": diseases},
                                "genes": {"ncbigene": genes},
                            },
                        },
                    },
                    cls=SetEncoder,
                )
            )

            f.write("\n")

In [19]:
!head export/bte_chemicals_diseases_genes.ndjson

{"_id": "PMC5732092__cshperspect-CYT-028522_F2.jpg", "associatedWith": {"title": "Interleukin (IL)-17RA/RC signaling pathways", "figureUrl": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5732092/bin/cshperspect-CYT-028522_F2.jpg", "pmc": "PMC5732092", "mentions": {"chemicals": {"mesh": []}, "diseases": {"mesh": []}, "genes": {"ncbigene": ["84818", "5196", "10131", "7188", "6387", "4791", "1994", "4283", "2920", "6426", "5966", "5473", "4117", "1432", "7189", "6374", "7186", "6300", "5594", "3627", "2919", "3326", "1051", "5601", "4790", "10758", "5603", "6373", "10563", "5600", "3576", "23765", "2921", "5602", "3605", "5971", "6372", "5599", "9547", "5595", "112744", "7184", "5970"]}}}}
{"_id": "PMC5793760__cshperspect-TGF-022210_F4.jpg", "associatedWith": {"title": "TGFB signaling pathways", "figureUrl": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5793760/bin/cshperspect-TGF-022210_F4.jpg", "pmc": "PMC5793760", "mentions": {"chemicals": {"mesh": []}, "diseases": {"mesh": []}, "genes