# Merge 2020 and 2021 Results

In [1]:
import json
import os
import re
import sys
import tempfile
from pathlib import Path
from pprint import pprint

import numpy as np
import pandas as pd
import requests
import requests_cache

In [2]:
requests_cache.install_cache("pfocr_cache")

## Import Data 2020

In [23]:
from functools import partial

import rpy2.robjects as ro
from rpy2.ipython import html
from rpy2.robjects import default_converter, pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.lib.dplyr import DataFrame
from rpy2.robjects.packages import importr

html.html_rdataframe = partial(html.html_rdataframe, table_class="docutils")

In [24]:
pandas2ri.activate()
base = importr("base")
readRDS = ro.r["readRDS"]
saveRDS = ro.r["saveRDS"]



In [25]:
def rds2pandas(rds_path):
    r_df = readRDS(str(rds_path))
    with localconverter(ro.default_converter + pandas2ri.converter):
        pandas_df = ro.conversion.rpy2py(r_df)
    return pandas_df

In [26]:
def pandas2rds(pandas_df, rds_path):
    with localconverter(default_converter + pandas2ri.converter) as cv:
        r_df = DataFrame(pandas_df)

    saveRDS(r_df, str(rds_path))

### Import Data 2020 ➜ Figures

In [6]:
pfocr_figures_2020_rds_url = (
    "https://www.dropbox.com/s/qhc33zho78rnaoj/pfocr_figures.rds?dl=1"
)

with tempfile.NamedTemporaryFile(suffix=".rds") as f:
    pfocr_figures_2020_rds_path = f.name
    with requests.get(pfocr_figures_2020_rds_url, stream=True) as r:
        for chunk in r.iter_content(chunk_size=128):
            f.write(chunk)
        f.seek(0)
    pfocr_figures_2020_df = rds2pandas(pfocr_figures_2020_rds_path).rename(
        columns={
            "figid": "pfocr_id",
            "pmcid": "pmc_id",
            "filename": "figure_filename",
            "number": "figure_number",
            "figtitle": "figure_title",
            "papertitle": "paper_title",
            "caption": "figure_caption",
            "figlink": "relative_figure_page_url",
            "reftext": "reference",
            "year": "publication_year",
        }
    )


pfocr_figures_2020_df["paper_url"] = (
    "https://www.ncbi.nlm.nih.gov/pmc/articles/"
    + pfocr_figures_2020_df["pmc_id"]
)

pfocr_figures_2020_df["figure_page_url"] = (
    "https://www.ncbi.nlm.nih.gov"
    + pfocr_figures_2020_df["relative_figure_page_url"]
)

pfocr_figures_2020_df["figure_thumbnail_url"] = (
    "https://www.ncbi.nlm.nih.gov/pmc/articles/"
    + pfocr_figures_2020_df["pmc_id"]
    + "/bin/"
    + pfocr_figures_2020_df["figure_filename"]
)

pfocr_figures_2020_df["pfocr_year"] = 2020

pfocr_figures_2020_df.drop(
    columns=[
        "relative_figure_page_url",
        "figure_filename",
        "source_f",
        "type.man",
        "automl_index",
    ],
    inplace=True,
)

pfocr_figures_2020_df

Unnamed: 0,pfocr_id,figure_number,reference,publication_year,pathway_score,pmc_ranked_result_index,figure_title,paper_title,figure_caption,pmc_id,paper_url,figure_page_url,figure_thumbnail_url,pfocr_year
1,PMC5653847__41598_2017_14124_Fig8_HTML.jpg,Figure 8,"Céline Barthelemy, et al. Sci Rep. 2017;7:13816.",2017,0.968270,133303,Model of FTY720-induced transporter endocytosi...,FTY720-induced endocytosis of yeast and human ...,Model of FTY720-induced transporter endocytosi...,PMC5653847,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,2020
2,PMC4187043__zh20191474070013.jpg,Fig. 13,"Yuan Wei, et al. Am J Physiol Renal Physiol. 2...",2014,0.965793,79929,Proposed signaling pathway by which the stimul...,Angiotensin II type 2 receptor regulates ROMK-...,Proposed signaling pathway by which the stimul...,PMC4187043,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,2020
3,PMC5746550__rsob-7-170228-g1.jpg,Figure 1,"Georgia R. Frost, et al. Open Biol. 2017 Dec;7...",2017,0.962470,98034,AŒ≤ production,The role of astrocytes in amyloid production a...,AŒ≤ production. In the amyloidogenic pathway (...,PMC5746550,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,2020
4,PMC4211692__pone.0110875.g008.jpg,Figure 8,"Enida Gjoni, et al. PLoS One. 2014;9(10):e110875.",2014,0.966721,142401,,Glucolipotoxicity Impairs Ceramide Flow from t...,Glucolipotoxicity impairs CERT- and vesicular-...,PMC4211692,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,2020
5,PMC2588433__nihms78212f8.jpg,Figure 8,"Amanda L. Lewis, et al. J Biol Chem. ;282(38):...",,0.966758,67398,,NeuA sialic acid O-acetylesterase activity mod...,Bacterial Sia biosynthesis can be divided into...,PMC2588433,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64639,PMC4216988__zh20221474360006.jpg,Fig. 6,"Marcelo D. Carattino, et al. Am J Physiol Rena...",2014,0.143076,108774,Hypothetical mechanism of activation of ENaC b...,Prostasin interacts with the epithelial Na+ ch...,Hypothetical mechanism of activation of ENaC b...,PMC4216988,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,2020
64640,PMC2873070__nihms128887f5.jpg,Scheme 1,"Hua Cheng, et al. Neurobiol Aging. ;31(7):1188...",,0.127176,143547,A schematic diagram of a proposed working mode...,Apolipoprotein E mediates sulfatide depletion ...,A schematic diagram of a proposed working mode...,PMC2873070,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,2020
64641,PMC3651446__pnas.1220523110fig06.jpg,Fig. 6,"Jiun-Ming Wu, et al. Proc Natl Acad Sci U S A....",2013,0.055546,159643,Models for nucleation of centrosomal and kinet...,Aurora kinase inhibitors reveal mechanisms of ...,Models for nucleation of centrosomal and kinet...,PMC3651446,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,2020
64642,PMC6770832__cancers-11-01236-g005.jpg,Figure 5,"Carmel Mothersill, et al. Cancers (Basel). 201...",2019,0.140041,618,A simplified TGFŒ≤ pathway leading to p21 expr...,Relevance of Non-Targeted Effects for Radiothe...,A simplified TGFŒ≤ pathway leading to p21 expr...,PMC6770832,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,2020


In [7]:
print(len(pfocr_figures_2020_df.columns))
pfocr_figures_2020_df.columns

14


Index(['pfocr_id', 'figure_number', 'reference', 'publication_year',
       'pathway_score', 'pmc_ranked_result_index', 'figure_title',
       'paper_title', 'figure_caption', 'pmc_id', 'paper_url',
       'figure_page_url', 'figure_thumbnail_url', 'pfocr_year'],
      dtype='object')

### Import Data 2020 ➜ Genes

In [8]:
# genes
pfocr_genes_2020_rds_url = (
    "https://www.dropbox.com/s/alf7auvxve36oer/pfocr_genes.rds?dl=1"
)

with tempfile.NamedTemporaryFile(suffix=".rds") as f:
    pfocr_genes_2020_rds_path = f.name
    with requests.get(pfocr_genes_2020_rds_url, stream=True) as r:
        for chunk in r.iter_content(chunk_size=128):
            f.write(chunk)
        f.seek(0)
    pfocr_genes_2020_df = rds2pandas(pfocr_genes_2020_rds_path).rename(
        columns={
            "figid": "pfocr_id",
            "pmcid": "pmc_id",
            "entrez": "ncbigene_id",
            "word": "ocr_text",
            "symbol": "lexicon_entry",
            "source": "lexicon_entry_source",
        }
    )

pfocr_genes_2020_df["pfocr_year"] = 2020

pfocr_genes_2020_df

Unnamed: 0,pfocr_id,pmc_id,ocr_text,lexicon_entry,lexicon_entry_source,hgnc_symbol,ncbigene_id,pfocr_year
1,PMC100003__mb2410470011.jpg,PMC100003,"Ga12,Gaq",G-ALPHA-q,hgnc_alias_symbol,GNAQ,2776,2020
2,PMC100003__mb2410470011.jpg,PMC100003,Etk,ETK,hgnc_alias_symbol,BMX,660,2020
3,PMC100003__mb2410470011.jpg,PMC100003,FAK,FAK,hgnc_alias_symbol,PTK2,5747,2020
4,PMC100003__mb2410470011.jpg,PMC100003,AR*,AR,hgnc_symbol,AR,367,2020
5,PMC100003__mb2410470011.jpg,PMC100003,(Src,SRC,hgnc_symbol,SRC,6714,2020
...,...,...,...,...,...,...,...,...
1112547,PMC99976__mb2310138007.jpg,PMC99976,MEK-2,MEK2,hgnc_alias_symbol,MAP2K2,5605,2020
1112548,PMC99976__mb2310138007.jpg,PMC99976,RAS,RAS,bioentities_symbol,HRAS,3265,2020
1112549,PMC99976__mb2310138007.jpg,PMC99976,RAS,RAS,bioentities_symbol,KRAS,3845,2020
1112550,PMC99976__mb2310138007.jpg,PMC99976,RAS,RAS,bioentities_symbol,NRAS,4893,2020


## Import Data 2021

### Import Data 2021 ➜ Figures

In [9]:
target_date = "20210513"
images_dir = Path(f"../data/images/{target_date}")

In [10]:
pfocr_figures_2021_df = rds2pandas(
    images_dir.joinpath("pfocr_figures_2021.rds")
)

# pfocr_figures_2021_df["pfocr_year"] = 2020


pfocr_figures_2021_df

Unnamed: 0,pfocr_id,figure_page_url,figure_thumbnail_url,figure_number,figure_title,figure_caption,pmc_id,paper_url,paper_title,reference,pmc_search_index,pathway_score,pfocr_year
11,PMC7226520__cells-09-01043-g007.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Figure 7,Comparative downstream pathway analysis of the...,Comparative downstream pathway analysis of the...,PMC7226520,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,"A STAT3 of Addiction: Adipose Tissue, Adipocyt...","Rose Kadye, et al. Cells. 2020 Apr;9(4):1043.",12,0.811027,2021
16,PMC7346062__aging-12-103262-g005.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Figure 5,KEGG pathways related to resveratrol-targeted ...,KEGG pathways related to resveratrol-targeted ...,PMC7346062,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Resveratrol promotes osteogenesis and alleviat...,"Tao Yu, et al. Aging (Albany NY). 2020 Jun 15;...",17,0.943144,2021
21,PMC7063815__13578_2020_396_Fig2_HTML.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Fig. 2,Interaction between tumor metabolism and the m...,Interaction between tumor metabolism and the m...,PMC7063815,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,mTOR signaling pathway and mTOR inhibitors in ...,"Zhilin Zou, et al. Cell Biosci. 2020;10:31.",22,0.943767,2021
22,PMC6497965__zbc0191904900006.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Figure 6,A proposed signaling cascade for phosphorylate...,A proposed signaling cascade for phosphorylate...,PMC6497965,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Phosphorylation of proliferating cell nuclear ...,"Bo Peng, et al. J Biol Chem. 2019 Apr 26;294(1...",23,0.603429,2021
23,PMC7538683__bsr-40-bsr20202711-g5.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Figure 5,,(A) Cell cycle signaling pathway is significan...,PMC7538683,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Bioinformatics analysis and experimental valid...,"Jiajia Chen, et al. Biosci Rep. 2020 Oct 30;40...",24,0.863582,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...
124414,PMC7803631__gr3a.jpg,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,,Fig,Cell free biosynthesis for erythromycin A. The...,PMC7803631,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Complex natural product production methods and...,"Dongwon Park, et al. Synth Syst Biotechnol. 20...",124515,0.648432,2021
124421,PMC7359798__gr2_lrg.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Figure 2,RNA Sensing and ResponseNon-comprehensive over...,RNA Sensing and ResponseNon-comprehensive over...,PMC7359798,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Immune Sensing Mechanisms that Discriminate Se...,"Eva Bartok, et al. Immunity. 2020 Jul 14;53(1)...",124522,0.922250,2021
124439,PMC7530268__fonc-10-586530-g0001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Figure 1,The ferroptotic cascade,The ferroptotic cascade. Accumulation of free ...,PMC7530268,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,From Iron Chelation to Overload as a Therapeut...,"Eric Grignano, et al. Front Oncol. 2020;10:586...",124540,0.952995,2021
124445,PMC7466447__fcell-08-00766-g001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,FIGURE 1,Immunoregulatory functions of TEC in the TME,Immunoregulatory functions of TEC in the TME. ...,PMC7466447,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Tumor Endothelial Cells (TECs) as Potential Im...,"Laurenz Nagl, et al. Front Cell Dev Biol. 2020...",124546,0.933006,2021


In [11]:
print(len(pfocr_figures_2021_df.columns))
pfocr_figures_2021_df.columns

13


Index(['pfocr_id', 'figure_page_url', 'figure_thumbnail_url', 'figure_number',
       'figure_title', 'figure_caption', 'pmc_id', 'paper_url', 'paper_title',
       'reference', 'pmc_search_index', 'pathway_score', 'pfocr_year'],
      dtype='object')

In [12]:
print(len(pfocr_figures_2020_df.columns))
pfocr_figures_2020_df.columns

14


Index(['pfocr_id', 'figure_number', 'reference', 'publication_year',
       'pathway_score', 'pmc_ranked_result_index', 'figure_title',
       'paper_title', 'figure_caption', 'pmc_id', 'paper_url',
       'figure_page_url', 'figure_thumbnail_url', 'pfocr_year'],
      dtype='object')

### Import Data 2021 ➜ Genes

In [17]:
pfocr_genes_2021_df = (
    rds2pandas(images_dir.joinpath("results202105241041.rds"))
    .rename(
        columns={
            "figure_id": "pfocr_id",
            "word": "ocr_text",
            "transformed_word": "lexicon_entry",
        }
    )
    .drop(columns=["transforms_applied"])
)

pfocr_genes_2021_df["pfocr_year"] = 2021

pfocr_genes_2021_df

Unnamed: 0,pfocr_id,ncbigene_id,ocr_text,lexicon_entry,pfocr_year
0,PMC8036963__cancers-13-01583-g003.jpg,3014,H2AX,H2AX,2021
1,PMC8036963__cancers-13-01583-g003.jpg,3014,H2AX,H2AX,2021
2,PMC8036963__cancers-13-01583-g003.jpg,2547,Ku70/80,KU70,2021
3,PMC8036963__cancers-13-01583-g003.jpg,7520,Ku70/80,KU80,2021
4,PMC8036963__cancers-13-01583-g003.jpg,472,ATM,ATM,2021
...,...,...,...,...,...
214150,PMC7927090__ijms-22-02194-g002.jpg,207,AKT,AKT,2021
214151,PMC7927090__ijms-22-02194-g002.jpg,4092,Smad,SMAD,2021
214152,PMC7927090__ijms-22-02194-g002.jpg,2475,MTOR,MTOR,2021
214153,PMC7927090__ijms-22-02194-g002.jpg,3791,VEGFR2,VEGFR2,2021


## Merge 2020 and 2021

In [None]:
data_dir = Path("~/Dropbox (Gladstone)/Documents/pathway-ocr/20210515/")

In [21]:
pfocr_figures_df = (
    pfocr_figures_2020_df.append(pfocr_figures_2021_df)
    .reset_index(drop=True)
    .drop(columns=["pmc_search_index"])
)
pandas2rds(pfocr_figures_df, data_dir.joinpath("pfocr_figures_v2.rds"))
pfocr_figures_df

Unnamed: 0,pfocr_id,figure_number,reference,publication_year,pathway_score,pmc_ranked_result_index,figure_title,paper_title,figure_caption,pmc_id,paper_url,figure_page_url,figure_thumbnail_url,pfocr_year
0,PMC5653847__41598_2017_14124_Fig8_HTML.jpg,Figure 8,"Céline Barthelemy, et al. Sci Rep. 2017;7:13816.",2017,0.968270,133303.0,Model of FTY720-induced transporter endocytosi...,FTY720-induced endocytosis of yeast and human ...,Model of FTY720-induced transporter endocytosi...,PMC5653847,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,2020
1,PMC4187043__zh20191474070013.jpg,Fig. 13,"Yuan Wei, et al. Am J Physiol Renal Physiol. 2...",2014,0.965793,79929.0,Proposed signaling pathway by which the stimul...,Angiotensin II type 2 receptor regulates ROMK-...,Proposed signaling pathway by which the stimul...,PMC4187043,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,2020
2,PMC5746550__rsob-7-170228-g1.jpg,Figure 1,"Georgia R. Frost, et al. Open Biol. 2017 Dec;7...",2017,0.962470,98034.0,AŒ≤ production,The role of astrocytes in amyloid production a...,AŒ≤ production. In the amyloidogenic pathway (...,PMC5746550,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,2020
3,PMC4211692__pone.0110875.g008.jpg,Figure 8,"Enida Gjoni, et al. PLoS One. 2014;9(10):e110875.",2014,0.966721,142401.0,,Glucolipotoxicity Impairs Ceramide Flow from t...,Glucolipotoxicity impairs CERT- and vesicular-...,PMC4211692,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,2020
4,PMC2588433__nihms78212f8.jpg,Figure 8,"Amanda L. Lewis, et al. J Biol Chem. ;282(38):...",,0.966758,67398.0,,NeuA sialic acid O-acetylesterase activity mod...,Bacterial Sia biosynthesis can be divided into...,PMC2588433,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80593,PMC7803631__gr3a.jpg,,"Dongwon Park, et al. Synth Syst Biotechnol. 20...",,0.648432,,Fig,Complex natural product production methods and...,Cell free biosynthesis for erythromycin A. The...,PMC7803631,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,2021
80594,PMC7359798__gr2_lrg.jpg,Figure 2,"Eva Bartok, et al. Immunity. 2020 Jul 14;53(1)...",,0.922250,,RNA Sensing and ResponseNon-comprehensive over...,Immune Sensing Mechanisms that Discriminate Se...,RNA Sensing and ResponseNon-comprehensive over...,PMC7359798,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,2021
80595,PMC7530268__fonc-10-586530-g0001.jpg,Figure 1,"Eric Grignano, et al. Front Oncol. 2020;10:586...",,0.952995,,The ferroptotic cascade,From Iron Chelation to Overload as a Therapeut...,The ferroptotic cascade. Accumulation of free ...,PMC7530268,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,2021
80596,PMC7466447__fcell-08-00766-g001.jpg,FIGURE 1,"Laurenz Nagl, et al. Front Cell Dev Biol. 2020...",,0.933006,,Immunoregulatory functions of TEC in the TME,Tumor Endothelial Cells (TECs) as Potential Im...,Immunoregulatory functions of TEC in the TME. ...,PMC7466447,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,2021


In [22]:
pfocr_genes_df = pfocr_genes_2020_df.append(pfocr_genes_2021_df).reset_index(
    drop=True
)
pandas2rds(pfocr_genes_df, data_dir.joinpath("pfocr_genes_v2.rds"))
pfocr_genes_df

Unnamed: 0,pfocr_id,pmc_id,ocr_text,lexicon_entry,lexicon_entry_source,hgnc_symbol,ncbigene_id,pfocr_year
0,PMC100003__mb2410470011.jpg,PMC100003,"Ga12,Gaq",G-ALPHA-q,hgnc_alias_symbol,GNAQ,2776,2020
1,PMC100003__mb2410470011.jpg,PMC100003,Etk,ETK,hgnc_alias_symbol,BMX,660,2020
2,PMC100003__mb2410470011.jpg,PMC100003,FAK,FAK,hgnc_alias_symbol,PTK2,5747,2020
3,PMC100003__mb2410470011.jpg,PMC100003,AR*,AR,hgnc_symbol,AR,367,2020
4,PMC100003__mb2410470011.jpg,PMC100003,(Src,SRC,hgnc_symbol,SRC,6714,2020
...,...,...,...,...,...,...,...,...
1326701,PMC7927090__ijms-22-02194-g002.jpg,,AKT,AKT,,,207,2021
1326702,PMC7927090__ijms-22-02194-g002.jpg,,Smad,SMAD,,,4092,2021
1326703,PMC7927090__ijms-22-02194-g002.jpg,,MTOR,MTOR,,,2475,2021
1326704,PMC7927090__ijms-22-02194-g002.jpg,,VEGFR2,VEGFR2,,,3791,2021


In [30]:
pfocr_genes_df["ncbigene_id"].drop_duplicates()

0            2776
1             660
2            5747
3             367
4            6714
            ...  
1324495     10526
1325328    158763
1325771       754
1325952     57048
1326255     79731
Name: ncbigene_id, Length: 14251, dtype: int32

In [32]:
pfocr_genes_df["pfocr_id"].drop_duplicates()

0                        PMC100003__mb2410470011.jpg
5                        PMC100005__mb2410575011.jpg
21                       PMC100008__mb2411709009.jpg
73                    PMC101225__1475-2867-1-1-1.jpg
74                           PMC101242__gkf20707.jpg
                             ...                    
1326624         PMC7761438__plants-09-01639-g001.jpg
1326645         PMC7275722__pnas.2006106117fig04.jpg
1326652    PMC7788892__12864_2020_7266_Fig5_HTML.jpg
1326654          PMC7670535__JAH3-9-e016615-g004.jpg
1326672           PMC7927090__ijms-22-02194-g002.jpg
Name: pfocr_id, Length: 73878, dtype: object

In [31]:
pfocr_figures_df["pfocr_id"].drop_duplicates()

0        PMC5653847__41598_2017_14124_Fig8_HTML.jpg
1                  PMC4187043__zh20191474070013.jpg
2                  PMC5746550__rsob-7-170228-g1.jpg
3                 PMC4211692__pone.0110875.g008.jpg
4                      PMC2588433__nihms78212f8.jpg
                            ...                    
80593                          PMC7803631__gr3a.jpg
80594                       PMC7359798__gr2_lrg.jpg
80595          PMC7530268__fonc-10-586530-g0001.jpg
80596           PMC7466447__fcell-08-00766-g001.jpg
80597         PMC7352181__cancers-12-01457-g002.jpg
Name: pfocr_id, Length: 79949, dtype: object

### Import Data 2020 ➜ Merge

In [14]:
pfocr_df = (
    (
        pfocr_figures_df.join(
            pfocr_genes_df.drop(columns=["pmc_id"]).set_index("pfocr_id"),
            on="pfocr_id",
            how="inner",
        )
    )
    .sort_values(["publication_year", "pmc_id", "pfocr_id", "ncbigene_id"])
    .reset_index(drop=True)
)
pfocr_df

Unnamed: 0,pfocr_id,figure_number,citation,publication_year,pathway_score,pmc_ranked_result_index,figure_title,paper_title,figure_caption,pmc_id,...,automl_index,paper_url,figure_page_url,figure_thumbnail_url,pfocr_year,ocr_text,lexicon_match,lexicon_match_source,hgnc_symbol,ncbigene_id
0,PMC6134364__GE-4-357-g005.jpg,FIG. 5,Joe B. Harford. Gene Expr. 1995;4(6):357-367.,1995,0.882552,115615,The PKR pathway,Translation-Targeted Therapeutics for Viral Di...,The PKR pathway. Double-stranded RNA (dsRNA) r...,PMC6134364,...,42272,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,2020,elF-2-,ELF2,hgnc_symbol,ELF2,1998
1,PMC6134364__GE-4-357-g005.jpg,FIG. 5,Joe B. Harford. Gene Expr. 1995;4(6):357-367.,1995,0.882552,115615,The PKR pathway,Translation-Targeted Therapeutics for Viral Di...,The PKR pathway. Double-stranded RNA (dsRNA) r...,PMC6134364,...,42272,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,2020,Interferon,Interferon,bioentities_symbol,IFNA1,3439
2,PMC6134364__GE-4-357-g005.jpg,FIG. 5,Joe B. Harford. Gene Expr. 1995;4(6):357-367.,1995,0.882552,115615,The PKR pathway,Translation-Targeted Therapeutics for Viral Di...,The PKR pathway. Double-stranded RNA (dsRNA) r...,PMC6134364,...,42272,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,2020,Interferon,Interferon,bioentities_symbol,IFNA2,3440
3,PMC6134364__GE-4-357-g005.jpg,FIG. 5,Joe B. Harford. Gene Expr. 1995;4(6):357-367.,1995,0.882552,115615,The PKR pathway,Translation-Targeted Therapeutics for Viral Di...,The PKR pathway. Double-stranded RNA (dsRNA) r...,PMC6134364,...,42272,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,2020,Interferon,Interferon,bioentities_symbol,IFNA4,3441
4,PMC6134364__GE-4-357-g005.jpg,FIG. 5,Joe B. Harford. Gene Expr. 1995;4(6):357-367.,1995,0.882552,115615,The PKR pathway,Translation-Targeted Therapeutics for Viral Di...,The PKR pathway. Double-stranded RNA (dsRNA) r...,PMC6134364,...,42272,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,2020,Interferon,Interferon,bioentities_symbol,IFNA5,3442
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1112546,PMC6908429__nihms-1057672-f0004.jpg,Figure 4,"Viviana Simon, et al. Nat Immunol. ;16(6):546-...",,0.967722,131802,"Tetherin blocks virus release, activates an in...",Intrinsic host restrictions to HIV-1 and mecha...,"Tetherin blocks virus release, activates an in...",PMC6908429,...,3287,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,2020,TAK1,TAK1,hgnc_alias_symbol,NR2C2,7182
1112547,PMC6908429__nihms-1057672-f0004.jpg,Figure 4,"Viviana Simon, et al. Nat Immunol. ;16(6):546-...",,0.967722,131802,"Tetherin blocks virus release, activates an in...",Intrinsic host restrictions to HIV-1 and mecha...,"Tetherin blocks virus release, activates an in...",PMC6908429,...,3287,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,2020,"TRAF2,6",TRAF2,hgnc_symbol,TRAF2,7186
1112548,PMC6908429__nihms-1057672-f0004.jpg,Figure 4,"Viviana Simon, et al. Nat Immunol. ;16(6):546-...",,0.967722,131802,"Tetherin blocks virus release, activates an in...",Intrinsic host restrictions to HIV-1 and mecha...,"Tetherin blocks virus release, activates an in...",PMC6908429,...,3287,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,2020,"TRAF2,6",TRAF6,hgnc_symbol,TRAF6,7189
1112549,PMC6908429__nihms-1057672-f0004.jpg,Figure 4,"Viviana Simon, et al. Nat Immunol. ;16(6):546-...",,0.967722,131802,"Tetherin blocks virus release, activates an in...",Intrinsic host restrictions to HIV-1 and mecha...,"Tetherin blocks virus release, activates an in...",PMC6908429,...,3287,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,2020,IKK,IKK,bioentities_symbol,IKBKG,8517


In [17]:
gene_counts_df = (
    pfocr_df[["pfocr_id", "ncbigene_id"]]
    .drop_duplicates()
    .groupby("pfocr_id")
    .count()
    .rename(columns={"ncbigene_id": "unique_gene_count"})
)
print(len(gene_counts_df))
pfocr_df = pfocr_df.join(gene_counts_df, on="pfocr_id")

58962


### Import Data 2021 ➜ Genes

Limit to figures having 10 or more unique genes.

In [62]:
cutoff_pfocr_df = pfocr_df[pfocr_df["unique_gene_count"] >= 10][
    ["pfocr_id", "figtitle", "entrez"]
].drop_duplicates()
cutoff_pfocr_df

Unnamed: 0,figid,figtitle,entrez
16,PMC1249490__nihms5296f3.jpg,The primary targets of lithium in the cell are...,2774
17,PMC1249490__nihms5296f3.jpg,The primary targets of lithium in the cell are...,2778
18,PMC1249490__nihms5296f3.jpg,The primary targets of lithium in the cell are...,5330
19,PMC1249490__nihms5296f3.jpg,The primary targets of lithium in the cell are...,5331
20,PMC1249490__nihms5296f3.jpg,The primary targets of lithium in the cell are...,5332
...,...,...,...
1112546,PMC6906418__12276_2019_299_Fig2_HTML.jpg,,90933
1112547,PMC6906418__12276_2019_299_Fig2_HTML.jpg,,91445
1112548,PMC6906418__12276_2019_299_Fig2_HTML.jpg,,115004
1112549,PMC6906418__12276_2019_299_Fig2_HTML.jpg,,284076


In [63]:
print(len(cutoff_pfocr_df["pfocr_id"].unique()))
print(len(cutoff_pfocr_df["figtitle"].unique()))
print(len(cutoff_pfocr_df["entrez"].unique()))

32277
23450
13153


## Export GMT

[GMT format description](https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#GMT:_Gene_Matrix_Transposed_file_format_.28.2A.gmt.29).

Sample of what we want:
```
PMC100005__mb2410575011.jpg     Formation of TrkA–src-aPKC complexes    5293    5294    25759   6714    8503    4914    10818   399694  30849   7189      23533   5290    6464    146850  53358   5291
```

TODO: there are some characters that could mess up the parsing of a GMT. For example, a `%` or a `\t` would be parsed as a delimiter. Also, if a doublequote `"` can be used to enclose fields with characters like spaces, then a title containing a doublequote would mess up a parser.

In those cases, do we want to escape the special characters or remove them?

In [12]:
def genes_to_gmt(df):
    return df["entrez"].astype("str").str.cat(sep="\t")

In [84]:
gmt_df = (
    cutoff_pfocr_df[["pfocr_id", "figtitle"]]
    .drop_duplicates()
    .set_index("pfocr_id")
    .copy(deep=True)
)
gmt_df["genes"] = (
    cutoff_pfocr_df.set_index("pfocr_id")
    .groupby("pfocr_id")
    .apply(lambda x: x["entrez"].astype("str").to_list())
)
gmt_df["genes_str"] = (
    cutoff_pfocr_df.set_index("pfocr_id")
    .groupby("pfocr_id")
    .apply(genes_to_gmt)
)
gmt_df = gmt_df.fillna("")

gmt_df

Unnamed: 0_level_0,figtitle,genes,genes_str
figid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PMC1249490__nihms5296f3.jpg,The primary targets of lithium in the cell are...,"[2774, 2778, 5330, 5331, 5332, 5333, 5335, 533...",2774\t2778\t5330\t5331\t5332\t5333\t5335\t5336...
PMC1307498__nihms5498f3.jpg,Schematic model of insulin-stimulated glucose ...,"[207, 208, 3667, 5163, 5164, 5165, 5166, 5290,...",207\t208\t3667\t5163\t5164\t5165\t5166\t5290\t...
PMC1307511__nihms2079f3.jpg,,"[317, 834, 835, 836, 837, 838, 839, 840, 841, ...",317\t834\t835\t836\t837\t838\t839\t840\t841\t8...
PMC1351030__nihms2404f8.jpg,A model of signaling pathway from oncogenic H-...,"[595, 688, 1432, 1958, 3265, 5594, 5595, 5599,...",595\t688\t1432\t1958\t3265\t5594\t5595\t5599\t...
PMC1352153__nihms-7536-0007.jpg,Model for GalR1 neuroplasticity within the LC,"[466, 1385, 1386, 1390, 2587, 4236, 5566, 5567...",466\t1385\t1386\t1390\t2587\t4236\t5566\t5567\...
...,...,...,...
PMC6904862__WJSC-11-1104-g003.jpg,Biomechanical stress stimulates the mechanosen...,"[207, 208, 1432, 5594, 5595, 5599, 5600, 5601,...",207\t208\t1432\t5594\t5595\t5599\t5600\t5601\t...
PMC6904864__WJSC-11-1084-g001.jpg,Small molecules guiding mesenchymal stem cell ...,"[207, 208, 649, 650, 651, 652, 653, 654, 655, ...",207\t208\t649\t650\t651\t652\t653\t654\t655\t6...
PMC6905007__40246_2019_252_Fig2_HTML.jpg,LRF/ZBTB7A‚Äôs silencing compromises Warburg e...,"[355, 596, 834, 835, 836, 837, 838, 839, 840, ...",355\t596\t834\t835\t836\t837\t838\t839\t840\t8...
PMC6906418__12276_2019_299_Fig1_HTML.jpg,,"[834, 835, 836, 837, 838, 839, 840, 841, 842, ...",834\t835\t836\t837\t838\t839\t840\t841\t842\t8...


Let's export the GMT file. The paths match AR's system, but you'll need to update them if yours aren't the same.

In [85]:
dropbox_dir = Path("~/Dropbox (Gladstone)").expanduser().resolve()
paper_dir = (
    dropbox_dir.joinpath("Documents/PFOCR_25Years").expanduser().resolve()
)
data_dir = paper_dir
# data_dir = paper_dir.joinpath("raw")

if not data_dir.exists():
    tmp_dir = tempfile.TemporaryDirectory()
    # can explicitly remove tmp_dir with tmp_dir.cleanup()
    data_dir = Path(tmp_dir.name)

gmt_path = data_dir.joinpath("pfocr_curated.gmt")

with open(gmt_path, "w") as gmt_f:
    for pfocr_id, row in gmt_df.iterrows():
        gmt_f.write(
            "\t".join(
                [
                    pfocr_id,
                    row["figtitle"],
                    row["genes_str"],
                ]
            )
        )
        gmt_f.write("\n")

print(f"Output saved: {gmt_path}")

Output saved: /home/ariutta/Dropbox (Gladstone)/Documents/PFOCR_25Years/pfocr_curated.gmt


In [86]:
ar_gmt_df = gmt_df.reset_index()

In [87]:
ap_gmt_path = paper_dir.joinpath("raw/pfocr_curated_ap.gmt")
ap_gmt_data = list()
with open(ap_gmt_path, "r") as gmt_f:
    for line in gmt_f:
        chunks = line.strip().split("\t")
        pfocr_id = chunks.pop(0)
        figtitle = chunks.pop(0)
        genes = chunks
        ap_gmt_data.append(
            {"pfocr_id": pfocr_id, "figtitle": figtitle, "genes": genes}
        )
ap_gmt_df = pd.DataFrame.from_records(ap_gmt_data)
ap_gmt_df

Unnamed: 0,figid,figtitle,genes
0,PMC100005__mb2410575011.jpg,Formation of TrkA–src-aPKC complexes,"[5293, 5294, 25759, 6714, 8503, 4914, 10818, 3..."
1,PMC100008__mb2411709009.jpg,Cooperation between Beta-AR and B2R signaling ...,"[5331, 5576, 51196, 23236, 5567, 5566, 107, 11..."
2,PMC101242__gkf20707.jpg,Up-regulation of RAD51 in 8-MOP-treated cells,"[5883, 5980, 11200, 5893, 10111, 5888, 546, 47..."
3,PMC101600__ac1100359003.jpg,Hypothetical biosynthetic pathway of coumermyc...,"[125965, 84701, 1327, 10063, 1352, 1353, 1351,..."
4,PMC102186__mb2400926009.jpg,Ras-mediated signaling pathways,"[6300, 5602, 6300, 5879, 5058, 5062, 57144, 58..."
...,...,...,...
32274,PMC99537__jb1610128005.jpg,Basic metabolic pathways in C,"[23038, 4236, 10603, 788, 5885, 9126, 6576, 12..."
32275,PMC99857__mb2010895009.jpg,"The B(a)P-induced apoptotic signaling pathway,...","[9459, 64374, 837, 100506742, 23581, 834, 835,..."
32276,PMC99889__mb2110211013.jpg,Erk activation following TCR stimulation,"[5594, 5595, 2885, 7535, 5604, 27040, 5327, 39..."
32277,PMC99957__mb2310813001.jpg,Purine metabolism in S,"[23038, 4236, 355, 22978, 5471, 2987, 1015, 83..."


The results from AP and from AR are not identical matches.

In [148]:
weird_ap = set(ap_gmt_df["pfocr_id"]) - set(ar_gmt_df["pfocr_id"])
print(len(weird_ap))
weird_ar = set(ar_gmt_df["pfocr_id"]) - set(ap_gmt_df["pfocr_id"])
print(len(weird_ar))

311
309


Sample of results in AP but not in AR.

In [89]:
ap_gmt_df[ap_gmt_df["pfocr_id"].isin(weird_ap)].head()

Unnamed: 0,figid,figtitle,genes
15,PMC1069556__zjv0080560770006.jpg,Effect of NS5A on the Beta-catenin signaling p...,"[7008, 2932, 572, 9733, 1499, 2308, 10000, 207..."
19,PMC107855__ii0180505007.jpg,Influence of OdDHL on the immune system,"[3592, 3576, 3458, 3664, 3553, 3593, 3439, 712..."
263,PMC140507__gkg141f1.jpg,The PI 3-kinase pathway by RNAi,"[5601, 5599, 207, 5602, 5601, 5163, 5599, 5728..."
290,PMC1435660__75f8c.jpg,Hypothetical model for HHT,"[7043, 94, 7046, 94, 7042, 2022, 51175, 7046, ..."
405,PMC1544360__nihms10600f7.jpg,Simplified repression model of angptl3 promote...,"[6256, 9612, 6257, 6258, 10062, 9612, 6256, 87..."


Sample of results in AR but not in AP.

In [90]:
ar_gmt_df[ar_gmt_df["pfocr_id"].isin(weird_ar)].head()

Unnamed: 0,figid,figtitle,genes,genes_str
151,PMC2254938__nihms-38751-f0001.jpg,,"[207, 208, 3265, 3667, 3845, 4893, 8471, 8660,...",207\t208\t3265\t3667\t3845\t4893\t8471\t8660\t...
194,PMC2398704__nihms-47534-f0003.jpg,Variation the insulin/insulin-like (IIS) pathw...,"[207, 948, 3479, 3667, 5163, 5164, 5165, 5166,...",207\t948\t3479\t3667\t5163\t5164\t5165\t5166\t...
241,PMC2529154__nihms62767f4.jpg,Proposed biosynthetic pathway for MDP chromoph...,"[712, 713, 714, 715, 716, 717, 718, 727, 729, ...",712\t713\t714\t715\t716\t717\t718\t727\t729\t7...
300,PMC2585776__nihms70500f3.jpg,,"[1432, 2353, 5594, 5595, 5599, 5600, 5601, 560...",1432\t2353\t5594\t5595\t5599\t5600\t5601\t5602...
301,PMC2585776__nihms70500f5.jpg,,"[1432, 2353, 5594, 5595, 5599, 5600, 5601, 560...",1432\t2353\t5594\t5595\t5599\t5600\t5601\t5602...


In [91]:
ap_gmt_df[ap_gmt_df["pfocr_id"].isin(weird_ap)]["genes"].apply(len).max()
ap_gmt_df["genes"].apply(len).max()

385

The results from AP include some cases where entrez ids are duplicated.

In [93]:
ap_gmt_df["unique_genes"] = ap_gmt_df["genes"].apply(lambda x: list(set(x)))
ap_gmt_df[
    ap_gmt_df["genes"].apply(len) != ap_gmt_df["unique_genes"].apply(len)
].head()

Unnamed: 0,figid,figtitle,genes,unique_genes
2,PMC101242__gkf20707.jpg,Up-regulation of RAD51 in 8-MOP-treated cells,"[5883, 5980, 11200, 5893, 10111, 5888, 546, 47...","[2521, 546, 545, 10111, 5980, 5893, 472, 5883,..."
4,PMC102186__mb2400926009.jpg,Ras-mediated signaling pathways,"[6300, 5602, 6300, 5879, 5058, 5062, 57144, 58...","[391, 5062, 5605, 3845, 10298, 56924, 3725, 56..."
5,PMC104390__mb0691873009.jpg,Proposed model for translational homeostasis i...,"[5293, 8503, 2475, 5291, 5464, 30849, 146850, ...","[5290, 208, 8503, 5464, 23533, 207, 5293, 5291..."
9,PMC1052008__JCI0524178.f2.jpg,Gq/11-activated pathways in maladaptive hypert...,"[5331, 5534, 5332, 5579, 23683, 5587, 5533, 55...","[5590, 5330, 5583, 5579, 5532, 5578, 23683, 55..."
12,PMC106275__am0681773006.jpg,Analysis of the region between styR and styA,"[55811, 196883, 107, 108, 112, 115, 114, 113, ...","[113, 90527, 196883, 107, 108, 55811, 112, 263..."


In [94]:
sorted(
    ap_gmt_df[
        ap_gmt_df["genes"].apply(len) != ap_gmt_df["unique_genes"].apply(len)
    ]["genes"].iat[0]
)

['10111',
 '11200',
 '2521',
 '472',
 '545',
 '546',
 '5883',
 '5884',
 '5884',
 '5888',
 '5893',
 '5980',
 '7376']

The following should have no rows, because the results from AR shouldn't have any duplicates:

In [96]:
ar_gmt_df["unique_genes"] = ar_gmt_df["genes"].apply(lambda x: list(set(x)))
ar_gmt_df[ar_gmt_df["genes"].apply(len) != ar_gmt_df["unique_genes"].apply(len)]

Unnamed: 0,figid,figtitle,genes,genes_str,unique_genes


There are `309` hits in AR that are missing from AP, e.g., the figure with pfocr_id `PMC4929164__294_2016_565_Fig2_HTML.jpg`.

In [103]:
len(weird_ar)

309

In [102]:
list(weird_ar)[0]

'PMC4929164__294_2016_565_Fig2_HTML.jpg'

In [99]:
pfocr_genes_df[pfocr_genes_df["pfocr_id"] == list(weird_ar)[0]].head()

Unnamed: 0,figid,pmcid,word,symbol,source,hgnc_symbol,entrez
661217,PMC4929164__294_2016_565_Fig2_HTML.jpg,PMC4929164,(CDK),CDK,bioentities_symbol,CDK11A,728642
661218,PMC4929164__294_2016_565_Fig2_HTML.jpg,PMC4929164,(CDK),CDK,bioentities_symbol,CDK8,1024
661219,PMC4929164__294_2016_565_Fig2_HTML.jpg,PMC4929164,(CDK),CDK,bioentities_symbol,CDK7,1022
661220,PMC4929164__294_2016_565_Fig2_HTML.jpg,PMC4929164,(CDK),CDK,bioentities_symbol,CDK3,1018
661221,PMC4929164__294_2016_565_Fig2_HTML.jpg,PMC4929164,(CDK),CDK,bioentities_symbol,CDKL5,6792


The results from AP don't even contain anything for the paper:

In [109]:
ap_gmt_df[ap_gmt_df["pfocr_id"].str.contains("PMC4929164", regex=False)]

Unnamed: 0,figid,figtitle,genes,unique_genes


There are `311` hits in AP that are missing from AR, e.g., the figure with pfocr_id `PMC5752509__oncotarget-08-109135-g007.jpg`.

In [106]:
len(weird_ap)

311

In [107]:
list(weird_ap)[0]

'PMC5752509__oncotarget-08-109135-g007.jpg'

In [113]:
print(len(pfocr_genes_df[pfocr_genes_df["pfocr_id"] == list(weird_ap)[0]]))
pfocr_genes_df[pfocr_genes_df["pfocr_id"] == list(weird_ap)[0]]

10


Unnamed: 0,figid,pmcid,word,symbol,source,hgnc_symbol,entrez
847113,PMC5752509__oncotarget-08-109135-g007.jpg,PMC5752509,RAR,RAR,bioentities_symbol,RARB,5915
847114,PMC5752509__oncotarget-08-109135-g007.jpg,PMC5752509,RAR,RAR,bioentities_symbol,RARG,5916
847115,PMC5752509__oncotarget-08-109135-g007.jpg,PMC5752509,P38,p38,bioentities_symbol,MAPK13,5603
847116,PMC5752509__oncotarget-08-109135-g007.jpg,PMC5752509,P38,p38,bioentities_symbol,MAPK12,6300
847117,PMC5752509__oncotarget-08-109135-g007.jpg,PMC5752509,P38,p38,bioentities_symbol,MAPK14,1432
847118,PMC5752509__oncotarget-08-109135-g007.jpg,PMC5752509,P38,p38,bioentities_symbol,MAPK11,5600
847119,PMC5752509__oncotarget-08-109135-g007.jpg,PMC5752509,RARa,RARA,hgnc_symbol,RARA,5914
847120,PMC5752509__oncotarget-08-109135-g007.jpg,PMC5752509,Src,SRC,hgnc_symbol,SRC,6714
847121,PMC5752509__oncotarget-08-109135-g007.jpg,PMC5752509,RAR,RAR,bioentities_symbol,RARA,5914
847122,PMC5752509__oncotarget-08-109135-g007.jpg,PMC5752509,GPR40,GPR40,hgnc_prev_symbol,FFAR1,2864


The results from AR don't even contain anything for the paper:

In [110]:
ar_gmt_df[ar_gmt_df["pfocr_id"].str.contains("PMC5752509", regex=False)]

Unnamed: 0,figid,figtitle,genes,genes_str,unique_genes


In [114]:
print(len(pfocr_genes_df[pfocr_genes_df["pfocr_id"].isin(weird_ap)]))
pfocr_genes_df[pfocr_genes_df["pfocr_id"].isin(weird_ap)]

3304


Unnamed: 0,figid,pmcid,word,symbol,source,hgnc_symbol,entrez
452,PMC1069556__zjv0080560770006.jpg,PMC1069556,Tef,TEF,hgnc_symbol,TEF,7008
453,PMC1069556__zjv0080560770006.jpg,PMC1069556,GSK-3B,GSK3B,hgnc_symbol,GSK3B,2932
454,PMC1069556__zjv0080560770006.jpg,PMC1069556,Bad,BAD,hgnc_symbol,BAD,572
455,PMC1069556__zjv0080560770006.jpg,PMC1069556,p110,p110,hgnc_alias_symbol,SART3,9733
456,PMC1069556__zjv0080560770006.jpg,PMC1069556,Bcatenin,BCATENIN,bioentities_symbol,CTNNB1,1499
...,...,...,...,...,...,...,...
1112145,PMC98722__ii1010637004.jpg,PMC98722,actin,Actin,bioentities_symbol,ACTG1,71
1112146,PMC98722__ii1010637004.jpg,PMC98722,actin,Actin,bioentities_symbol,ACTG2,72
1112147,PMC98722__ii1010637004.jpg,PMC98722,Cdc42,CDC42,hgnc_symbol,CDC42,998
1112148,PMC98722__ii1010637004.jpg,PMC98722,N-WASP,N-WASP,hgnc_alias_symbol,WASL,8976


In [119]:
pfocr_genes_df

Unnamed: 0,figid,pmcid,word,symbol,source,hgnc_symbol,entrez
0,PMC100003__mb2410470011.jpg,PMC100003,"Ga12,Gaq",G-ALPHA-q,hgnc_alias_symbol,GNAQ,2776
1,PMC100003__mb2410470011.jpg,PMC100003,Etk,ETK,hgnc_alias_symbol,BMX,660
2,PMC100003__mb2410470011.jpg,PMC100003,FAK,FAK,hgnc_alias_symbol,PTK2,5747
3,PMC100003__mb2410470011.jpg,PMC100003,AR*,AR,hgnc_symbol,AR,367
4,PMC100003__mb2410470011.jpg,PMC100003,(Src,SRC,hgnc_symbol,SRC,6714
...,...,...,...,...,...,...,...
1112546,PMC99976__mb2310138007.jpg,PMC99976,MEK-2,MEK2,hgnc_alias_symbol,MAP2K2,5605
1112547,PMC99976__mb2310138007.jpg,PMC99976,RAS,RAS,bioentities_symbol,HRAS,3265
1112548,PMC99976__mb2310138007.jpg,PMC99976,RAS,RAS,bioentities_symbol,KRAS,3845
1112549,PMC99976__mb2310138007.jpg,PMC99976,RAS,RAS,bioentities_symbol,NRAS,4893


In [124]:
pfocr_df[pfocr_df["pfocr_id"].isin(weird_ap)]["unique_gene_count"].max()

9

In [133]:
pfocr_genes_df[pfocr_genes_df["pfocr_id"].isin(weird_ap)].groupby("pfocr_id")[
    "entrez"
].count().max()

17

In [138]:
counts_df = (
    pfocr_genes_df[pfocr_genes_df["pfocr_id"].isin(weird_ap)]
    .groupby("pfocr_id")["entrez"]
    .count()
)
counts_df.sort_values()
# counts_df.max()
# counts_df[counts_df["entrez"] == 17]

figid
PMC1069556__zjv0080560770006.jpg              10
PMC4800799__zmb9991011760007.jpg              10
PMC4815360__pr.115.011833f3.jpg               10
PMC4820813__kcll-05-04-1136374-g005.jpg       10
PMC4825016__zbc0171641000007.jpg              10
                                              ..
PMC5735158__41598_2017_16627_Fig6_HTML.jpg    15
PMC5915505__441_2018_2801_Fig2_HTML.jpg       15
PMC6272071__IJBMS-21-911-g007.jpg             15
PMC5045067__hp-2-197Fig1.jpg                  15
PMC3151466__nihms-315348-f0002.jpg            17
Name: entrez, Length: 311, dtype: int64

In [131]:
pfocr_df[pfocr_df["pfocr_id"].isin(weird_ap)][["unique_gene_count"]].max()

unique_gene_count    9
dtype: int64

In [130]:
pfocr_df[
    (pfocr_df["pfocr_id"].isin(weird_ap))
    & (pfocr_df["unique_gene_count"] == 17)
][["pfocr_id", "unique_gene_count", "entrez"]]

Unnamed: 0,figid,unique_gene_count,entrez


In [140]:
pfocr_df[pfocr_df["pfocr_id"] == "PMC3151466__nihms-315348-f0002.jpg"]

Unnamed: 0,figid,number,reftext,year,pathway_score,pmc_ranked_result_index,figtitle,papertitle,caption,figlink,...,automl_index,figurl,word,symbol,source,hgnc_symbol,entrez,nobe_count,entrez_count,unique_gene_count
64942,PMC3151466__nihms-315348-f0002.jpg,Fig. (2),"Wei Du, et al. Curr Drug Targets. ;10(7):581-589.",,0.910064,190619,,The Rb Pathway and Cancer Therapeutics.,In mammals the E2F family is composed of eight...,/pmc/articles/PMC3151466/figure/F2/,...,35244,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,E2F1,E2F1,hgnc_symbol,E2F1,1869,10,17,9
64943,PMC3151466__nihms-315348-f0002.jpg,Fig. (2),"Wei Du, et al. Curr Drug Targets. ;10(7):581-589.",,0.910064,190619,,The Rb Pathway and Cancer Therapeutics.,In mammals the E2F family is composed of eight...,/pmc/articles/PMC3151466/figure/F2/,...,35244,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,E2F,E2F,bioentities_symbol,E2F1,1869,10,17,9
64944,PMC3151466__nihms-315348-f0002.jpg,Fig. (2),"Wei Du, et al. Curr Drug Targets. ;10(7):581-589.",,0.910064,190619,,The Rb Pathway and Cancer Therapeutics.,In mammals the E2F family is composed of eight...,/pmc/articles/PMC3151466/figure/F2/,...,35244,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,E2F2,E2F2,hgnc_symbol,E2F2,1870,10,17,9
64945,PMC3151466__nihms-315348-f0002.jpg,Fig. (2),"Wei Du, et al. Curr Drug Targets. ;10(7):581-589.",,0.910064,190619,,The Rb Pathway and Cancer Therapeutics.,In mammals the E2F family is composed of eight...,/pmc/articles/PMC3151466/figure/F2/,...,35244,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,E2F,E2F,bioentities_symbol,E2F2,1870,10,17,9
64946,PMC3151466__nihms-315348-f0002.jpg,Fig. (2),"Wei Du, et al. Curr Drug Targets. ;10(7):581-589.",,0.910064,190619,,The Rb Pathway and Cancer Therapeutics.,In mammals the E2F family is composed of eight...,/pmc/articles/PMC3151466/figure/F2/,...,35244,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,E2F3,E2F3,hgnc_symbol,E2F3,1871,10,17,9
64947,PMC3151466__nihms-315348-f0002.jpg,Fig. (2),"Wei Du, et al. Curr Drug Targets. ;10(7):581-589.",,0.910064,190619,,The Rb Pathway and Cancer Therapeutics.,In mammals the E2F family is composed of eight...,/pmc/articles/PMC3151466/figure/F2/,...,35244,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,E2F,E2F,bioentities_symbol,E2F3,1871,10,17,9
64948,PMC3151466__nihms-315348-f0002.jpg,Fig. (2),"Wei Du, et al. Curr Drug Targets. ;10(7):581-589.",,0.910064,190619,,The Rb Pathway and Cancer Therapeutics.,In mammals the E2F family is composed of eight...,/pmc/articles/PMC3151466/figure/F2/,...,35244,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,E2F,E2F,bioentities_symbol,E2F4,1874,10,17,9
64949,PMC3151466__nihms-315348-f0002.jpg,Fig. (2),"Wei Du, et al. Curr Drug Targets. ;10(7):581-589.",,0.910064,190619,,The Rb Pathway and Cancer Therapeutics.,In mammals the E2F family is composed of eight...,/pmc/articles/PMC3151466/figure/F2/,...,35244,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,E2F4,E2F4,hgnc_symbol,E2F4,1874,10,17,9
64950,PMC3151466__nihms-315348-f0002.jpg,Fig. (2),"Wei Du, et al. Curr Drug Targets. ;10(7):581-589.",,0.910064,190619,,The Rb Pathway and Cancer Therapeutics.,In mammals the E2F family is composed of eight...,/pmc/articles/PMC3151466/figure/F2/,...,35244,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,E2F,E2F,bioentities_symbol,E2F5,1875,10,17,9
64951,PMC3151466__nihms-315348-f0002.jpg,Fig. (2),"Wei Du, et al. Curr Drug Targets. ;10(7):581-589.",,0.910064,190619,,The Rb Pathway and Cancer Therapeutics.,In mammals the E2F family is composed of eight...,/pmc/articles/PMC3151466/figure/F2/,...,35244,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,E2F5,E2F5,hgnc_symbol,E2F5,1875,10,17,9


In [125]:
pfocr_genes_df[pfocr_genes_df["pfocr_id"].isin(weird_ap)][
    ["pfocr_id", "entrez"]
].drop_duplicates().groupby("pfocr_id")["entrez"].count().max()

9

For GMT files, tabs and percent signs are special characters. Let's check whether we have any that require escaping.

No pfocr_ids or figtitles have tabs, and no pfocr_id has a percent sign.

In [173]:
for i, row in (
    pfocr_df[
        pfocr_df["figtitle"].str.contains("\t")
        | pfocr_df["pfocr_id"].str.contains("\t")
        | pfocr_df["pfocr_id"].str.contains("\%")
    ][["pfocr_id", "figtitle"]]
    .drop_duplicates()
    .iterrows()
):
    figtitle = row["figtitle"]
    pfocr_id = row["pfocr_id"]
    print(f"{pfocr_id} '{figtitle}'")

Some figtitles do have at least one percent sign.

In [174]:
for i, row in (
    pfocr_df[pfocr_df["figtitle"].str.contains("\%")][["pfocr_id", "figtitle"]]
    .drop_duplicates()
    .iterrows()
):
    figtitle = row["figtitle"]
    pfocr_id = row["pfocr_id"]
    print(pfocr_id)
    print(figtitle)
    print("")

PMC2703816__nihms100324f3.jpg
Functional annotation of 115 C/EBPŒ≤ target genes presented in pie chart format shows that 74% of target genes are dedicated to signaling, metabolism, and transport, with the remaining 26% having roles in transcription, synaptic transmission, differentiation and proliferation

PMC2730981__nihms137404f1.jpg
Activation of inflammatory pathway mediated through NF-Œ∫B by life-style related factors such as tobacco, stress, dietary agents, obesity, alcohol, infectious agents, irradiation and environmental stimuli that account for as much as 95% of all cancers

PMC2750869__nihms91210f5.jpg
Final elaboration of 8, the requisite cyclization precursor, entailed treatment of 2-butanone (21) with (‚àí)-DIPCl and Et3N according to conditions developed by Paterson,, followed by addition of aldehyde (+)-20; cyclization precursor (8) was obtained as an inseparable mixture of diastereomers (5:1) in 86% yield ()

PMC3188852__nihms196597f11.jpg
In order to construct the quat

In [152]:
pfocr_df[pfocr_df["pfocr_id"].str.contains("\t")]

Unnamed: 0,figid,number,reftext,year,pathway_score,pmc_ranked_result_index,figtitle,papertitle,caption,figlink,...,automl_index,figurl,word,symbol,source,hgnc_symbol,entrez,nobe_count,entrez_count,unique_gene_count


In [175]:
# A pfocr_id that rpy2 may be importing incorrectly. Check the encoding of figtitles.
pfocr_figures_df[
    pfocr_figures_df["pfocr_id"] == "PMC2703816__nihms100324f3.jpg"
]

Unnamed: 0,figid,number,reftext,year,pathway_score,pmc_ranked_result_index,figtitle,papertitle,caption,figlink,pmcid,filename,source_f,type.man,automl_index,figurl
61692,PMC2703816__nihms100324f3.jpg,Figure 3,"N. Kfoury, et al. Mol Cell Neurosci. ;40(3):31...",,0.708703,222034,Functional annotation of 115 C/EBPŒ≤ target ge...,Identification of neuronal target genes for CC...,Functional annotation of 115 C/EBPŒ≤ target ge...,/pmc/articles/PMC2703816/figure/F3/,PMC2703816,nihms100324f3.jpg,../data/images/PMC2703816__nihms100324f3.jpg,,58133,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...


In [176]:
import pandas as pd
import rpy2
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr

pandas2ri.activate()
base = importr("base")
readRDS = robjects.r["readRDS"]

In [None]:
def rdf2pandas(df):
    df_names = base.names(df)
    data = dict()
    for i in range(len(df_names)):
        column = df[i]
        if hasattr(column, "iter_labels"):
            data[df_names[i]] = [x for x in column.iter_labels()]
        else:
            data[df_names[i]] = [x for x in column]

    return pd.DataFrame(data=data)


def rds2pandas(f):
    return rdf2pandas(readRDS(str(f)))

In [177]:
hmm_pfocr_figures_df = readRDS(
    "~/Dropbox (Gladstone)/Documents/PFOCR_25Years/pfocr_figures.rds"
)
hmm_pfocr_figures_df.head()

figid,pmcid,filename,...,number,caption,organism
'PMC5...,'PMC5...,'4159...,...,'Figu...,'Mode...,'Homo...
'PMC4...,'PMC4...,'zh20...,...,'Fig....,'Prop...,'Homo...
'PMC5...,'PMC5...,'rsob...,...,'Figu...,'Aβ p...,'Homo...
'PMC4...,'PMC4...,'pone...,...,'Figu...,'Gluc...,'Homo...
'PMC2...,'PMC2...,'nihm...,...,'Figu...,'Bact...,'Homo...
'PMC4...,'PMC4...,'emss...,...,'Figu...,'Sche...,'Homo...


In [184]:
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr

In [185]:
r_df = ro.DataFrame(
    {
        "int_values": ro.IntVector([1, 2, 3]),
        "str_values": ro.StrVector(["abc", "def", "ghi"]),
    }
)

r_df

int_values,str_values
1,'abc'
2,'def'
3,'ghi'


Demo from [here](https://rpy2.github.io/doc/v3.4.x/html/generated_rst/pandas.html)

In [192]:
with localconverter(ro.default_converter + pandas2ri.converter):
    pd_from_r_df = ro.conversion.ri2py_dataframe(r_df)

pd_from_r_df

AttributeError: module 'rpy2.robjects.conversion' has no attribute 'ri2py_dataframe'

In [195]:
from rpy2.robjects import default_converter, pandas2ri
from rpy2.robjects.conversion import localconverter

with localconverter(default_converter + pandas2ri.converter) as cv:
    pd_from_r_df = pandas2ri.ri2py(r_df)
pd_from_r_df

ValueError: Buffer for this type not yet supported.

In [193]:
pandas2ri.ri2py_dataframe(r_df)

ValueError: Buffer for this type not yet supported.

In [188]:
rpy2.situation

<module 'rpy2.situation' from '/nix/store/hm475d8nhddsi2cvipkhl8k06cmxn7kh-python3-3.7.5-env/lib/python3.7/site-packages/rpy2/situation.py'>

In [189]:
ro.conversion.ri2py

<function rpy2.robjects.conversion._ri2py(obj)>

In [197]:
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr

In [198]:
pd_df = pd.DataFrame(
    {"int_values": [1, 2, 3], "str_values": ["abc", "def", "ghi"]}
)

pd_df

Unnamed: 0,int_values,str_values
0,1,abc
1,2,def
2,3,ghi


In [199]:
with localconverter(ro.default_converter + pandas2ri.converter):
    r_from_pd_df = ro.conversion.py2rpy(pd_df)

r_from_pd_df

AttributeError: module 'rpy2.robjects.conversion' has no attribute 'py2rpy'

In [200]:
from functools import partial

from rpy2.ipython import html

html.html_rdataframe = partial(html.html_rdataframe, table_class="docutils")

ModuleNotFoundError: No module named 'simplegeneric'

In [35]:
# /run/user/1000/tmp_yz5_q44/pfocr20200224.gmt