In [47]:
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


# Export 2020-02-24 Results as GMT

In [48]:
from pprint import pprint
import json
import os
import tempfile
from pathlib import Path
import re
import sys

import numpy as np
import pandas as pd

import requests
import requests_cache

import rpy2

In [49]:
requests_cache.install_cache("pfocr_cache")

In [50]:
from rdf2pandas import rds2pandas

## Import Data 

In [51]:
# figures
pfocr_figures_rds_url = (
    "https://www.dropbox.com/s/qhc33zho78rnaoj/pfocr_figures.rds?dl=1"
)

with tempfile.NamedTemporaryFile(suffix=".rds") as f:
    pfocr_figures_rds_path = f.name
    with requests.get(pfocr_figures_rds_url, stream=True) as r:
        for chunk in r.iter_content(chunk_size=128):
            f.write(chunk)
        f.seek(0)
    pfocr_figures_df = rds2pandas(pfocr_figures_rds_path)
pfocr_figures_df["figurl"] = (
    "https://www.ncbi.nlm.nih.gov/pmc/articles/"
    + pfocr_figures_df["pmcid"]
    + "/bin/"
    + pfocr_figures_df["filename"]
)
pfocr_figures_df

Unnamed: 0,figid,number,reftext,year,pathway_score,pmc_ranked_result_index,figtitle,papertitle,caption,figlink,pmcid,filename,source_f,type.man,automl_index,figurl
0,PMC5653847__41598_2017_14124_Fig8_HTML.jpg,Figure 8,"Céline Barthelemy, et al. Sci Rep. 2017;7:13816.",2017,0.968270,133303,Model of FTY720-induced transporter endocytosi...,FTY720-induced endocytosis of yeast and human ...,Model of FTY720-induced transporter endocytosi...,/pmc/articles/PMC5653847/figure/Fig8/,PMC5653847,41598_2017_14124_Fig8_HTML.jpg,../data/images/PMC5653847__41598_2017_14124_Fi...,,3012,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
1,PMC4187043__zh20191474070013.jpg,Fig. 13,"Yuan Wei, et al. Am J Physiol Renal Physiol. 2...",2014,0.965793,79929,Proposed signaling pathway by which the stimul...,Angiotensin II type 2 receptor regulates ROMK-...,Proposed signaling pathway by which the stimul...,/pmc/articles/PMC4187043/figure/F13/,PMC4187043,zh20191474070013.jpg,../data/images/PMC4187043__zh20191474070013.jpg,,4323,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
2,PMC5746550__rsob-7-170228-g1.jpg,Figure 1,"Georgia R. Frost, et al. Open Biol. 2017 Dec;7...",2017,0.962470,98034,AŒ≤ production,The role of astrocytes in amyloid production a...,AŒ≤ production. In the amyloidogenic pathway (...,/pmc/articles/PMC5746550/figure/RSOB170228F1/,PMC5746550,rsob-7-170228-g1.jpg,../data/images/PMC5746550__rsob-7-170228-g1.jpg,,6334,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
3,PMC4211692__pone.0110875.g008.jpg,Figure 8,"Enida Gjoni, et al. PLoS One. 2014;9(10):e110875.",2014,0.966721,142401,,Glucolipotoxicity Impairs Ceramide Flow from t...,Glucolipotoxicity impairs CERT- and vesicular-...,/pmc/articles/PMC4211692/figure/pone-0110875-g...,PMC4211692,pone.0110875.g008.jpg,../data/images/PMC4211692__pone.0110875.g008.jpg,,3808,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
4,PMC2588433__nihms78212f8.jpg,Figure 8,"Amanda L. Lewis, et al. J Biol Chem. ;282(38):...",,0.966758,67398,,NeuA sialic acid O-acetylesterase activity mod...,Bacterial Sia biosynthesis can be divided into...,/pmc/articles/PMC2588433/figure/F8/,PMC2588433,nihms78212f8.jpg,../data/images/PMC2588433__nihms78212f8.jpg,,3790,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64638,PMC4216988__zh20221474360006.jpg,Fig. 6,"Marcelo D. Carattino, et al. Am J Physiol Rena...",2014,0.143076,108774,Hypothetical mechanism of activation of ENaC b...,Prostasin interacts with the epithelial Na+ ch...,Hypothetical mechanism of activation of ENaC b...,/pmc/articles/PMC4216988/figure/F6/,PMC4216988,zh20221474360006.jpg,../data/images/PMC4216988__zh20221474360006.jpg,pathway,77324,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
64639,PMC2873070__nihms128887f5.jpg,Scheme 1,"Hua Cheng, et al. Neurobiol Aging. ;31(7):1188...",,0.127176,143547,A schematic diagram of a proposed working mode...,Apolipoprotein E mediates sulfatide depletion ...,A schematic diagram of a proposed working mode...,/pmc/articles/PMC2873070/figure/F5/,PMC2873070,nihms128887f5.jpg,../data/images/PMC2873070__nihms128887f5.jpg,pathway,78813,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...
64640,PMC3651446__pnas.1220523110fig06.jpg,Fig. 6,"Jiun-Ming Wu, et al. Proc Natl Acad Sci U S A....",2013,0.055546,159643,Models for nucleation of centrosomal and kinet...,Aurora kinase inhibitors reveal mechanisms of ...,Models for nucleation of centrosomal and kinet...,/pmc/articles/PMC3651446/figure/fig06/,PMC3651446,pnas.1220523110fig06.jpg,../data/images/PMC3651446__pnas.1220523110fig0...,pathway,114977,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...
64641,PMC6770832__cancers-11-01236-g005.jpg,Figure 5,"Carmel Mothersill, et al. Cancers (Basel). 201...",2019,0.140041,618,A simplified TGFŒ≤ pathway leading to p21 expr...,Relevance of Non-Targeted Effects for Radiothe...,A simplified TGFŒ≤ pathway leading to p21 expr...,/pmc/articles/PMC6770832/figure/cancers-11-012...,PMC6770832,cancers-11-01236-g005.jpg,../data/images/PMC6770832__cancers-11-01236-g0...,pathway,77606,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...


In [52]:
# genes
pfocr_genes_rds_url = "https://www.dropbox.com/s/alf7auvxve36oer/pfocr_genes.rds?dl=1"

with tempfile.NamedTemporaryFile(suffix=".rds") as f:
    pfocr_genes_rds_path = f.name
    with requests.get(pfocr_genes_rds_url, stream=True) as r:
        for chunk in r.iter_content(chunk_size=128):
            f.write(chunk)
        f.seek(0)
    pfocr_genes_df = rds2pandas(pfocr_genes_rds_path)
pfocr_genes_df

Unnamed: 0,figid,pmcid,word,symbol,source,hgnc_symbol,entrez
0,PMC100003__mb2410470011.jpg,PMC100003,"Ga12,Gaq",G-ALPHA-q,hgnc_alias_symbol,GNAQ,2776
1,PMC100003__mb2410470011.jpg,PMC100003,Etk,ETK,hgnc_alias_symbol,BMX,660
2,PMC100003__mb2410470011.jpg,PMC100003,FAK,FAK,hgnc_alias_symbol,PTK2,5747
3,PMC100003__mb2410470011.jpg,PMC100003,AR*,AR,hgnc_symbol,AR,367
4,PMC100003__mb2410470011.jpg,PMC100003,(Src,SRC,hgnc_symbol,SRC,6714
...,...,...,...,...,...,...,...
1112546,PMC99976__mb2310138007.jpg,PMC99976,MEK-2,MEK2,hgnc_alias_symbol,MAP2K2,5605
1112547,PMC99976__mb2310138007.jpg,PMC99976,RAS,RAS,bioentities_symbol,HRAS,3265
1112548,PMC99976__mb2310138007.jpg,PMC99976,RAS,RAS,bioentities_symbol,KRAS,3845
1112549,PMC99976__mb2310138007.jpg,PMC99976,RAS,RAS,bioentities_symbol,NRAS,4893


In [53]:
# nobe_counts
pfocr_nobe_counts_rds_url = (
    "https://www.dropbox.com/s/pd8n5hs4vrvb4st/pfocr_nobe_counts.rds?dl=1"
)

with tempfile.NamedTemporaryFile(suffix=".rds") as f:
    pfocr_nobe_counts_rds_path = f.name
    with requests.get(pfocr_nobe_counts_rds_url, stream=True) as r:
        for chunk in r.iter_content(chunk_size=128):
            f.write(chunk)
        f.seek(0)
    pfocr_nobe_counts_df = rds2pandas(pfocr_nobe_counts_rds_path)
pfocr_nobe_counts_df

Unnamed: 0,figid,nobe_count,entrez_count
0,PMC100003__mb2410470011.jpg,5,5
1,PMC100005__mb2410575011.jpg,6,16
2,PMC100008__mb2411709009.jpg,8,52
3,PMC101225__1475-2867-1-1-1.jpg,1,1
4,PMC101242__gkf20707.jpg,13,13
...,...,...,...
58957,PMC99889__mb2110211013.jpg,12,22
58958,PMC99903__mb2110106008.jpg,1,1
58959,PMC99957__mb2310813001.jpg,10,10
58960,PMC99957__mb2310813011.jpg,2,2


In [54]:
# merge the dfs to get pfocr analysis set
pfocr_df = (
    (
        pfocr_figures_df.join(
            pfocr_genes_df.drop(columns=["pmcid"]).set_index("figid"),
            on="figid",
            how="inner",
        ).join(
            pfocr_nobe_counts_df.set_index("figid"), on="figid", how="inner",
        )
    )
    .sort_values(["year", "pmcid", "figid", "entrez"])
    .reset_index(drop=True)
)
pfocr_df

Unnamed: 0,figid,number,reftext,year,pathway_score,pmc_ranked_result_index,figtitle,papertitle,caption,figlink,...,type.man,automl_index,figurl,word,symbol,source,hgnc_symbol,entrez,nobe_count,entrez_count
0,PMC1201394__nihms3677f4.jpg,Fig. 4,"Marissa J. Carstens, et al. J Biol Chem. ;279(...",,0.894122,38017,,Cell Cycle Arrest and Cell Death Are Controlle...,"A, Mdm2 is negatively regulated by p19Arf. As ...",/pmc/articles/PMC1201394/figure/F4/,...,,39679,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,(Cdkn2a,CDKN2A,hgnc_symbol,CDKN2A,1029,6,7
1,PMC1201394__nihms3677f4.jpg,Fig. 4,"Marissa J. Carstens, et al. J Biol Chem. ;279(...",,0.894122,38017,,Cell Cycle Arrest and Cell Death Are Controlle...,"A, Mdm2 is negatively regulated by p19Arf. As ...",/pmc/articles/PMC1201394/figure/F4/,...,,39679,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,p19/Arf,ARF,hgnc_alias_symbol,CDKN2A,1029,6,7
2,PMC1201394__nihms3677f4.jpg,Fig. 4,"Marissa J. Carstens, et al. J Biol Chem. ;279(...",,0.894122,38017,,Cell Cycle Arrest and Cell Death Are Controlle...,"A, Mdm2 is negatively regulated by p19Arf. As ...",/pmc/articles/PMC1201394/figure/F4/,...,,39679,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,G1/S,Gs,bioentities_symbol,GNAL,2774,6,7
3,PMC1201394__nihms3677f4.jpg,Fig. 4,"Marissa J. Carstens, et al. J Biol Chem. ;279(...",,0.894122,38017,,Cell Cycle Arrest and Cell Death Are Controlle...,"A, Mdm2 is negatively regulated by p19Arf. As ...",/pmc/articles/PMC1201394/figure/F4/,...,,39679,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,G1/S,Gs,bioentities_symbol,GNAS,2778,6,7
4,PMC1201394__nihms3677f4.jpg,Fig. 4,"Marissa J. Carstens, et al. J Biol Chem. ;279(...",,0.894122,38017,,Cell Cycle Arrest and Cell Death Are Controlle...,"A, Mdm2 is negatively regulated by p19Arf. As ...",/pmc/articles/PMC1201394/figure/F4/,...,,39679,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,Mdm2,MDM2,hgnc_symbol,MDM2,4193,6,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1112546,PMC6906418__12276_2019_299_Fig2_HTML.jpg,Fig. 2,"Hyun-Cheol Lee, et al. Exp Mol Med. 2019 Dec;5...",2019,0.828043,65444,,Intracellular sensing of viral genomes and vir...,Schematic presentation of positive and negativ...,/pmc/articles/PMC6906418/figure/Fig2/,...,,50139,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,TRIM41,TRIM41,hgnc_symbol,TRIM41,90933,24,28
1112547,PMC6906418__12276_2019_299_Fig2_HTML.jpg,Fig. 2,"Hyun-Cheol Lee, et al. Exp Mol Med. 2019 Dec;5...",2019,0.828043,65444,,Intracellular sensing of viral genomes and vir...,Schematic presentation of positive and negativ...,/pmc/articles/PMC6906418/figure/Fig2/,...,,50139,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,RNF185,RNF185,hgnc_symbol,RNF185,91445,24,28
1112548,PMC6906418__12276_2019_299_Fig2_HTML.jpg,Fig. 2,"Hyun-Cheol Lee, et al. Exp Mol Med. 2019 Dec;5...",2019,0.828043,65444,,Intracellular sensing of viral genomes and vir...,Schematic presentation of positive and negativ...,/pmc/articles/PMC6906418/figure/Fig2/,...,,50139,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,CGAS,CGAS,hgnc_symbol,CGAS,115004,24,28
1112549,PMC6906418__12276_2019_299_Fig2_HTML.jpg,Fig. 2,"Hyun-Cheol Lee, et al. Exp Mol Med. 2019 Dec;5...",2019,0.828043,65444,,Intracellular sensing of viral genomes and vir...,Schematic presentation of positive and negativ...,/pmc/articles/PMC6906418/figure/Fig2/,...,,50139,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,TTLL6,TTLL6,hgnc_symbol,TTLL6,284076,24,28


In [55]:
gene_counts_df = (
    pfocr_df[["figid", "entrez"]]
    .drop_duplicates()
    .groupby("figid")
    .count()
    .rename(columns={"entrez": "unique_gene_count"})
)
print(len(gene_counts_df))
pfocr_df = pfocr_df.join(gene_counts_df, on="figid")

58962


Limit to figures having 10 or more unique genes.

In [62]:
cutoff_pfocr_df = pfocr_df[pfocr_df["unique_gene_count"] >= 10][
    ["figid", "figtitle", "entrez"]
].drop_duplicates()
cutoff_pfocr_df

Unnamed: 0,figid,figtitle,entrez
16,PMC1249490__nihms5296f3.jpg,The primary targets of lithium in the cell are...,2774
17,PMC1249490__nihms5296f3.jpg,The primary targets of lithium in the cell are...,2778
18,PMC1249490__nihms5296f3.jpg,The primary targets of lithium in the cell are...,5330
19,PMC1249490__nihms5296f3.jpg,The primary targets of lithium in the cell are...,5331
20,PMC1249490__nihms5296f3.jpg,The primary targets of lithium in the cell are...,5332
...,...,...,...
1112546,PMC6906418__12276_2019_299_Fig2_HTML.jpg,,90933
1112547,PMC6906418__12276_2019_299_Fig2_HTML.jpg,,91445
1112548,PMC6906418__12276_2019_299_Fig2_HTML.jpg,,115004
1112549,PMC6906418__12276_2019_299_Fig2_HTML.jpg,,284076


In [63]:
print(len(cutoff_pfocr_df["figid"].unique()))
print(len(cutoff_pfocr_df["figtitle"].unique()))
print(len(cutoff_pfocr_df["entrez"].unique()))

32277
23450
13153


## Export GMT

[GMT format description](https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#GMT:_Gene_Matrix_Transposed_file_format_.28.2A.gmt.29).

Sample of what we want:
```
PMC100005__mb2410575011.jpg     Formation of TrkA–src-aPKC complexes    5293    5294    25759   6714    8503    4914    10818   399694  30849   7189      23533   5290    6464    146850  53358   5291
```

TODO: there are some characters that could mess up the parsing of a GMT. For example, a `%` or a `\t` would be parsed as a delimiter. Also, if a doublequote `"` can be used to enclose fields with characters like spaces, then a title containing a doublequote would mess up a parser.

In those cases, do we want to escape the special characters or remove them?

In [12]:
def genes_to_gmt(df):
    return df["entrez"].astype("str").str.cat(sep="\t")

In [84]:
gmt_df = (
    cutoff_pfocr_df[["figid", "figtitle"]]
    .drop_duplicates()
    .set_index("figid")
    .copy(deep=True)
)
gmt_df["genes"] = (
    cutoff_pfocr_df.set_index("figid")
    .groupby("figid")
    .apply(lambda x: x["entrez"].astype("str").to_list())
)
gmt_df["genes_str"] = (
    cutoff_pfocr_df.set_index("figid").groupby("figid").apply(genes_to_gmt)
)
gmt_df = gmt_df.fillna("")

gmt_df

Unnamed: 0_level_0,figtitle,genes,genes_str
figid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PMC1249490__nihms5296f3.jpg,The primary targets of lithium in the cell are...,"[2774, 2778, 5330, 5331, 5332, 5333, 5335, 533...",2774\t2778\t5330\t5331\t5332\t5333\t5335\t5336...
PMC1307498__nihms5498f3.jpg,Schematic model of insulin-stimulated glucose ...,"[207, 208, 3667, 5163, 5164, 5165, 5166, 5290,...",207\t208\t3667\t5163\t5164\t5165\t5166\t5290\t...
PMC1307511__nihms2079f3.jpg,,"[317, 834, 835, 836, 837, 838, 839, 840, 841, ...",317\t834\t835\t836\t837\t838\t839\t840\t841\t8...
PMC1351030__nihms2404f8.jpg,A model of signaling pathway from oncogenic H-...,"[595, 688, 1432, 1958, 3265, 5594, 5595, 5599,...",595\t688\t1432\t1958\t3265\t5594\t5595\t5599\t...
PMC1352153__nihms-7536-0007.jpg,Model for GalR1 neuroplasticity within the LC,"[466, 1385, 1386, 1390, 2587, 4236, 5566, 5567...",466\t1385\t1386\t1390\t2587\t4236\t5566\t5567\...
...,...,...,...
PMC6904862__WJSC-11-1104-g003.jpg,Biomechanical stress stimulates the mechanosen...,"[207, 208, 1432, 5594, 5595, 5599, 5600, 5601,...",207\t208\t1432\t5594\t5595\t5599\t5600\t5601\t...
PMC6904864__WJSC-11-1084-g001.jpg,Small molecules guiding mesenchymal stem cell ...,"[207, 208, 649, 650, 651, 652, 653, 654, 655, ...",207\t208\t649\t650\t651\t652\t653\t654\t655\t6...
PMC6905007__40246_2019_252_Fig2_HTML.jpg,LRF/ZBTB7A‚Äôs silencing compromises Warburg e...,"[355, 596, 834, 835, 836, 837, 838, 839, 840, ...",355\t596\t834\t835\t836\t837\t838\t839\t840\t8...
PMC6906418__12276_2019_299_Fig1_HTML.jpg,,"[834, 835, 836, 837, 838, 839, 840, 841, 842, ...",834\t835\t836\t837\t838\t839\t840\t841\t842\t8...


Let's export the GMT file. The paths match AR's system, but you'll need to update them if yours aren't the same.

In [85]:
dropbox_dir = Path("~/Dropbox (Gladstone)").expanduser().resolve()
paper_dir = dropbox_dir.joinpath("Documents/PFOCR_25Years").expanduser().resolve()
data_dir = paper_dir
# data_dir = paper_dir.joinpath("raw")

if not data_dir.exists():
    tmp_dir = tempfile.TemporaryDirectory()
    # can explicitly remove tmp_dir with tmp_dir.cleanup()
    data_dir = Path(tmp_dir.name)

gmt_path = data_dir.joinpath("pfocr_curated.gmt")

with open(gmt_path, "w") as gmt_f:
    for figid, row in gmt_df.iterrows():
        gmt_f.write("\t".join([figid, row["figtitle"], row["genes_str"],]))
        gmt_f.write("\n")

print(f"Output saved: {gmt_path}")

Output saved: /home/ariutta/Dropbox (Gladstone)/Documents/PFOCR_25Years/pfocr_curated.gmt


In [86]:
ar_gmt_df = gmt_df.reset_index()

In [87]:
ap_gmt_path = paper_dir.joinpath("raw/pfocr_curated_ap.gmt")
ap_gmt_data = list()
with open(ap_gmt_path, "r") as gmt_f:
    for line in gmt_f:
        chunks = line.strip().split("\t")
        figid = chunks.pop(0)
        figtitle = chunks.pop(0)
        genes = chunks
        ap_gmt_data.append({"figid": figid, "figtitle": figtitle, "genes": genes})
ap_gmt_df = pd.DataFrame.from_records(ap_gmt_data)
ap_gmt_df

Unnamed: 0,figid,figtitle,genes
0,PMC100005__mb2410575011.jpg,Formation of TrkA–src-aPKC complexes,"[5293, 5294, 25759, 6714, 8503, 4914, 10818, 3..."
1,PMC100008__mb2411709009.jpg,Cooperation between Beta-AR and B2R signaling ...,"[5331, 5576, 51196, 23236, 5567, 5566, 107, 11..."
2,PMC101242__gkf20707.jpg,Up-regulation of RAD51 in 8-MOP-treated cells,"[5883, 5980, 11200, 5893, 10111, 5888, 546, 47..."
3,PMC101600__ac1100359003.jpg,Hypothetical biosynthetic pathway of coumermyc...,"[125965, 84701, 1327, 10063, 1352, 1353, 1351,..."
4,PMC102186__mb2400926009.jpg,Ras-mediated signaling pathways,"[6300, 5602, 6300, 5879, 5058, 5062, 57144, 58..."
...,...,...,...
32274,PMC99537__jb1610128005.jpg,Basic metabolic pathways in C,"[23038, 4236, 10603, 788, 5885, 9126, 6576, 12..."
32275,PMC99857__mb2010895009.jpg,"The B(a)P-induced apoptotic signaling pathway,...","[9459, 64374, 837, 100506742, 23581, 834, 835,..."
32276,PMC99889__mb2110211013.jpg,Erk activation following TCR stimulation,"[5594, 5595, 2885, 7535, 5604, 27040, 5327, 39..."
32277,PMC99957__mb2310813001.jpg,Purine metabolism in S,"[23038, 4236, 355, 22978, 5471, 2987, 1015, 83..."


The results from AP and from AR are not identical matches.

In [148]:
weird_ap = set(ap_gmt_df["figid"]) - set(ar_gmt_df["figid"])
print(len(weird_ap))
weird_ar = set(ar_gmt_df["figid"]) - set(ap_gmt_df["figid"])
print(len(weird_ar))

311
309


Sample of results in AP but not in AR.

In [89]:
ap_gmt_df[ap_gmt_df["figid"].isin(weird_ap)].head()

Unnamed: 0,figid,figtitle,genes
15,PMC1069556__zjv0080560770006.jpg,Effect of NS5A on the Beta-catenin signaling p...,"[7008, 2932, 572, 9733, 1499, 2308, 10000, 207..."
19,PMC107855__ii0180505007.jpg,Influence of OdDHL on the immune system,"[3592, 3576, 3458, 3664, 3553, 3593, 3439, 712..."
263,PMC140507__gkg141f1.jpg,The PI 3-kinase pathway by RNAi,"[5601, 5599, 207, 5602, 5601, 5163, 5599, 5728..."
290,PMC1435660__75f8c.jpg,Hypothetical model for HHT,"[7043, 94, 7046, 94, 7042, 2022, 51175, 7046, ..."
405,PMC1544360__nihms10600f7.jpg,Simplified repression model of angptl3 promote...,"[6256, 9612, 6257, 6258, 10062, 9612, 6256, 87..."


Sample of results in AR but not in AP.

In [90]:
ar_gmt_df[ar_gmt_df["figid"].isin(weird_ar)].head()

Unnamed: 0,figid,figtitle,genes,genes_str
151,PMC2254938__nihms-38751-f0001.jpg,,"[207, 208, 3265, 3667, 3845, 4893, 8471, 8660,...",207\t208\t3265\t3667\t3845\t4893\t8471\t8660\t...
194,PMC2398704__nihms-47534-f0003.jpg,Variation the insulin/insulin-like (IIS) pathw...,"[207, 948, 3479, 3667, 5163, 5164, 5165, 5166,...",207\t948\t3479\t3667\t5163\t5164\t5165\t5166\t...
241,PMC2529154__nihms62767f4.jpg,Proposed biosynthetic pathway for MDP chromoph...,"[712, 713, 714, 715, 716, 717, 718, 727, 729, ...",712\t713\t714\t715\t716\t717\t718\t727\t729\t7...
300,PMC2585776__nihms70500f3.jpg,,"[1432, 2353, 5594, 5595, 5599, 5600, 5601, 560...",1432\t2353\t5594\t5595\t5599\t5600\t5601\t5602...
301,PMC2585776__nihms70500f5.jpg,,"[1432, 2353, 5594, 5595, 5599, 5600, 5601, 560...",1432\t2353\t5594\t5595\t5599\t5600\t5601\t5602...


In [91]:
ap_gmt_df[ap_gmt_df["figid"].isin(weird_ap)]["genes"].apply(len).max()
ap_gmt_df["genes"].apply(len).max()

385

The results from AP include some cases where entrez ids are duplicated.

In [93]:
ap_gmt_df["unique_genes"] = ap_gmt_df["genes"].apply(lambda x: list(set(x)))
ap_gmt_df[ap_gmt_df["genes"].apply(len) != ap_gmt_df["unique_genes"].apply(len)].head()

Unnamed: 0,figid,figtitle,genes,unique_genes
2,PMC101242__gkf20707.jpg,Up-regulation of RAD51 in 8-MOP-treated cells,"[5883, 5980, 11200, 5893, 10111, 5888, 546, 47...","[2521, 546, 545, 10111, 5980, 5893, 472, 5883,..."
4,PMC102186__mb2400926009.jpg,Ras-mediated signaling pathways,"[6300, 5602, 6300, 5879, 5058, 5062, 57144, 58...","[391, 5062, 5605, 3845, 10298, 56924, 3725, 56..."
5,PMC104390__mb0691873009.jpg,Proposed model for translational homeostasis i...,"[5293, 8503, 2475, 5291, 5464, 30849, 146850, ...","[5290, 208, 8503, 5464, 23533, 207, 5293, 5291..."
9,PMC1052008__JCI0524178.f2.jpg,Gq/11-activated pathways in maladaptive hypert...,"[5331, 5534, 5332, 5579, 23683, 5587, 5533, 55...","[5590, 5330, 5583, 5579, 5532, 5578, 23683, 55..."
12,PMC106275__am0681773006.jpg,Analysis of the region between styR and styA,"[55811, 196883, 107, 108, 112, 115, 114, 113, ...","[113, 90527, 196883, 107, 108, 55811, 112, 263..."


In [94]:
sorted(
    ap_gmt_df[ap_gmt_df["genes"].apply(len) != ap_gmt_df["unique_genes"].apply(len)][
        "genes"
    ].iat[0]
)

['10111',
 '11200',
 '2521',
 '472',
 '545',
 '546',
 '5883',
 '5884',
 '5884',
 '5888',
 '5893',
 '5980',
 '7376']

The following should have no rows, because the results from AR shouldn't have any duplicates:

In [96]:
ar_gmt_df["unique_genes"] = ar_gmt_df["genes"].apply(lambda x: list(set(x)))
ar_gmt_df[ar_gmt_df["genes"].apply(len) != ar_gmt_df["unique_genes"].apply(len)]

Unnamed: 0,figid,figtitle,genes,genes_str,unique_genes


There are `309` hits in AR that are missing from AP, e.g., the figure with figid `PMC4929164__294_2016_565_Fig2_HTML.jpg`.

In [103]:
len(weird_ar)

309

In [102]:
list(weird_ar)[0]

'PMC4929164__294_2016_565_Fig2_HTML.jpg'

In [99]:
pfocr_genes_df[pfocr_genes_df["figid"] == list(weird_ar)[0]].head()

Unnamed: 0,figid,pmcid,word,symbol,source,hgnc_symbol,entrez
661217,PMC4929164__294_2016_565_Fig2_HTML.jpg,PMC4929164,(CDK),CDK,bioentities_symbol,CDK11A,728642
661218,PMC4929164__294_2016_565_Fig2_HTML.jpg,PMC4929164,(CDK),CDK,bioentities_symbol,CDK8,1024
661219,PMC4929164__294_2016_565_Fig2_HTML.jpg,PMC4929164,(CDK),CDK,bioentities_symbol,CDK7,1022
661220,PMC4929164__294_2016_565_Fig2_HTML.jpg,PMC4929164,(CDK),CDK,bioentities_symbol,CDK3,1018
661221,PMC4929164__294_2016_565_Fig2_HTML.jpg,PMC4929164,(CDK),CDK,bioentities_symbol,CDKL5,6792


The results from AP don't even contain anything for the paper:

In [109]:
ap_gmt_df[ap_gmt_df["figid"].str.contains("PMC4929164", regex=False)]

Unnamed: 0,figid,figtitle,genes,unique_genes


There are `311` hits in AP that are missing from AR, e.g., the figure with figid `PMC5752509__oncotarget-08-109135-g007.jpg`.

In [106]:
len(weird_ap)

311

In [107]:
list(weird_ap)[0]

'PMC5752509__oncotarget-08-109135-g007.jpg'

In [113]:
print(len(pfocr_genes_df[pfocr_genes_df["figid"] == list(weird_ap)[0]]))
pfocr_genes_df[pfocr_genes_df["figid"] == list(weird_ap)[0]]

10


Unnamed: 0,figid,pmcid,word,symbol,source,hgnc_symbol,entrez
847113,PMC5752509__oncotarget-08-109135-g007.jpg,PMC5752509,RAR,RAR,bioentities_symbol,RARB,5915
847114,PMC5752509__oncotarget-08-109135-g007.jpg,PMC5752509,RAR,RAR,bioentities_symbol,RARG,5916
847115,PMC5752509__oncotarget-08-109135-g007.jpg,PMC5752509,P38,p38,bioentities_symbol,MAPK13,5603
847116,PMC5752509__oncotarget-08-109135-g007.jpg,PMC5752509,P38,p38,bioentities_symbol,MAPK12,6300
847117,PMC5752509__oncotarget-08-109135-g007.jpg,PMC5752509,P38,p38,bioentities_symbol,MAPK14,1432
847118,PMC5752509__oncotarget-08-109135-g007.jpg,PMC5752509,P38,p38,bioentities_symbol,MAPK11,5600
847119,PMC5752509__oncotarget-08-109135-g007.jpg,PMC5752509,RARa,RARA,hgnc_symbol,RARA,5914
847120,PMC5752509__oncotarget-08-109135-g007.jpg,PMC5752509,Src,SRC,hgnc_symbol,SRC,6714
847121,PMC5752509__oncotarget-08-109135-g007.jpg,PMC5752509,RAR,RAR,bioentities_symbol,RARA,5914
847122,PMC5752509__oncotarget-08-109135-g007.jpg,PMC5752509,GPR40,GPR40,hgnc_prev_symbol,FFAR1,2864


The results from AR don't even contain anything for the paper:

In [110]:
ar_gmt_df[ar_gmt_df["figid"].str.contains("PMC5752509", regex=False)]

Unnamed: 0,figid,figtitle,genes,genes_str,unique_genes


In [114]:
print(len(pfocr_genes_df[pfocr_genes_df["figid"].isin(weird_ap)]))
pfocr_genes_df[pfocr_genes_df["figid"].isin(weird_ap)]

3304


Unnamed: 0,figid,pmcid,word,symbol,source,hgnc_symbol,entrez
452,PMC1069556__zjv0080560770006.jpg,PMC1069556,Tef,TEF,hgnc_symbol,TEF,7008
453,PMC1069556__zjv0080560770006.jpg,PMC1069556,GSK-3B,GSK3B,hgnc_symbol,GSK3B,2932
454,PMC1069556__zjv0080560770006.jpg,PMC1069556,Bad,BAD,hgnc_symbol,BAD,572
455,PMC1069556__zjv0080560770006.jpg,PMC1069556,p110,p110,hgnc_alias_symbol,SART3,9733
456,PMC1069556__zjv0080560770006.jpg,PMC1069556,Bcatenin,BCATENIN,bioentities_symbol,CTNNB1,1499
...,...,...,...,...,...,...,...
1112145,PMC98722__ii1010637004.jpg,PMC98722,actin,Actin,bioentities_symbol,ACTG1,71
1112146,PMC98722__ii1010637004.jpg,PMC98722,actin,Actin,bioentities_symbol,ACTG2,72
1112147,PMC98722__ii1010637004.jpg,PMC98722,Cdc42,CDC42,hgnc_symbol,CDC42,998
1112148,PMC98722__ii1010637004.jpg,PMC98722,N-WASP,N-WASP,hgnc_alias_symbol,WASL,8976


In [119]:
pfocr_genes_df

Unnamed: 0,figid,pmcid,word,symbol,source,hgnc_symbol,entrez
0,PMC100003__mb2410470011.jpg,PMC100003,"Ga12,Gaq",G-ALPHA-q,hgnc_alias_symbol,GNAQ,2776
1,PMC100003__mb2410470011.jpg,PMC100003,Etk,ETK,hgnc_alias_symbol,BMX,660
2,PMC100003__mb2410470011.jpg,PMC100003,FAK,FAK,hgnc_alias_symbol,PTK2,5747
3,PMC100003__mb2410470011.jpg,PMC100003,AR*,AR,hgnc_symbol,AR,367
4,PMC100003__mb2410470011.jpg,PMC100003,(Src,SRC,hgnc_symbol,SRC,6714
...,...,...,...,...,...,...,...
1112546,PMC99976__mb2310138007.jpg,PMC99976,MEK-2,MEK2,hgnc_alias_symbol,MAP2K2,5605
1112547,PMC99976__mb2310138007.jpg,PMC99976,RAS,RAS,bioentities_symbol,HRAS,3265
1112548,PMC99976__mb2310138007.jpg,PMC99976,RAS,RAS,bioentities_symbol,KRAS,3845
1112549,PMC99976__mb2310138007.jpg,PMC99976,RAS,RAS,bioentities_symbol,NRAS,4893


In [124]:
pfocr_df[pfocr_df["figid"].isin(weird_ap)]["unique_gene_count"].max()

9

In [133]:
pfocr_genes_df[pfocr_genes_df["figid"].isin(weird_ap)].groupby("figid")[
    "entrez"
].count().max()

17

In [138]:
counts_df = (
    pfocr_genes_df[pfocr_genes_df["figid"].isin(weird_ap)]
    .groupby("figid")["entrez"]
    .count()
)
counts_df.sort_values()
# counts_df.max()
# counts_df[counts_df["entrez"] == 17]

figid
PMC1069556__zjv0080560770006.jpg              10
PMC4800799__zmb9991011760007.jpg              10
PMC4815360__pr.115.011833f3.jpg               10
PMC4820813__kcll-05-04-1136374-g005.jpg       10
PMC4825016__zbc0171641000007.jpg              10
                                              ..
PMC5735158__41598_2017_16627_Fig6_HTML.jpg    15
PMC5915505__441_2018_2801_Fig2_HTML.jpg       15
PMC6272071__IJBMS-21-911-g007.jpg             15
PMC5045067__hp-2-197Fig1.jpg                  15
PMC3151466__nihms-315348-f0002.jpg            17
Name: entrez, Length: 311, dtype: int64

In [131]:
pfocr_df[pfocr_df["figid"].isin(weird_ap)][["unique_gene_count"]].max()

unique_gene_count    9
dtype: int64

In [130]:
pfocr_df[(pfocr_df["figid"].isin(weird_ap)) & (pfocr_df["unique_gene_count"] == 17)][
    ["figid", "unique_gene_count", "entrez"]
]

Unnamed: 0,figid,unique_gene_count,entrez


In [140]:
pfocr_df[pfocr_df["figid"] == "PMC3151466__nihms-315348-f0002.jpg"]

Unnamed: 0,figid,number,reftext,year,pathway_score,pmc_ranked_result_index,figtitle,papertitle,caption,figlink,...,automl_index,figurl,word,symbol,source,hgnc_symbol,entrez,nobe_count,entrez_count,unique_gene_count
64942,PMC3151466__nihms-315348-f0002.jpg,Fig. (2),"Wei Du, et al. Curr Drug Targets. ;10(7):581-589.",,0.910064,190619,,The Rb Pathway and Cancer Therapeutics.,In mammals the E2F family is composed of eight...,/pmc/articles/PMC3151466/figure/F2/,...,35244,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,E2F1,E2F1,hgnc_symbol,E2F1,1869,10,17,9
64943,PMC3151466__nihms-315348-f0002.jpg,Fig. (2),"Wei Du, et al. Curr Drug Targets. ;10(7):581-589.",,0.910064,190619,,The Rb Pathway and Cancer Therapeutics.,In mammals the E2F family is composed of eight...,/pmc/articles/PMC3151466/figure/F2/,...,35244,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,E2F,E2F,bioentities_symbol,E2F1,1869,10,17,9
64944,PMC3151466__nihms-315348-f0002.jpg,Fig. (2),"Wei Du, et al. Curr Drug Targets. ;10(7):581-589.",,0.910064,190619,,The Rb Pathway and Cancer Therapeutics.,In mammals the E2F family is composed of eight...,/pmc/articles/PMC3151466/figure/F2/,...,35244,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,E2F2,E2F2,hgnc_symbol,E2F2,1870,10,17,9
64945,PMC3151466__nihms-315348-f0002.jpg,Fig. (2),"Wei Du, et al. Curr Drug Targets. ;10(7):581-589.",,0.910064,190619,,The Rb Pathway and Cancer Therapeutics.,In mammals the E2F family is composed of eight...,/pmc/articles/PMC3151466/figure/F2/,...,35244,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,E2F,E2F,bioentities_symbol,E2F2,1870,10,17,9
64946,PMC3151466__nihms-315348-f0002.jpg,Fig. (2),"Wei Du, et al. Curr Drug Targets. ;10(7):581-589.",,0.910064,190619,,The Rb Pathway and Cancer Therapeutics.,In mammals the E2F family is composed of eight...,/pmc/articles/PMC3151466/figure/F2/,...,35244,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,E2F3,E2F3,hgnc_symbol,E2F3,1871,10,17,9
64947,PMC3151466__nihms-315348-f0002.jpg,Fig. (2),"Wei Du, et al. Curr Drug Targets. ;10(7):581-589.",,0.910064,190619,,The Rb Pathway and Cancer Therapeutics.,In mammals the E2F family is composed of eight...,/pmc/articles/PMC3151466/figure/F2/,...,35244,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,E2F,E2F,bioentities_symbol,E2F3,1871,10,17,9
64948,PMC3151466__nihms-315348-f0002.jpg,Fig. (2),"Wei Du, et al. Curr Drug Targets. ;10(7):581-589.",,0.910064,190619,,The Rb Pathway and Cancer Therapeutics.,In mammals the E2F family is composed of eight...,/pmc/articles/PMC3151466/figure/F2/,...,35244,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,E2F,E2F,bioentities_symbol,E2F4,1874,10,17,9
64949,PMC3151466__nihms-315348-f0002.jpg,Fig. (2),"Wei Du, et al. Curr Drug Targets. ;10(7):581-589.",,0.910064,190619,,The Rb Pathway and Cancer Therapeutics.,In mammals the E2F family is composed of eight...,/pmc/articles/PMC3151466/figure/F2/,...,35244,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,E2F4,E2F4,hgnc_symbol,E2F4,1874,10,17,9
64950,PMC3151466__nihms-315348-f0002.jpg,Fig. (2),"Wei Du, et al. Curr Drug Targets. ;10(7):581-589.",,0.910064,190619,,The Rb Pathway and Cancer Therapeutics.,In mammals the E2F family is composed of eight...,/pmc/articles/PMC3151466/figure/F2/,...,35244,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,E2F,E2F,bioentities_symbol,E2F5,1875,10,17,9
64951,PMC3151466__nihms-315348-f0002.jpg,Fig. (2),"Wei Du, et al. Curr Drug Targets. ;10(7):581-589.",,0.910064,190619,,The Rb Pathway and Cancer Therapeutics.,In mammals the E2F family is composed of eight...,/pmc/articles/PMC3151466/figure/F2/,...,35244,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,E2F5,E2F5,hgnc_symbol,E2F5,1875,10,17,9


In [125]:
pfocr_genes_df[pfocr_genes_df["figid"].isin(weird_ap)][
    ["figid", "entrez"]
].drop_duplicates().groupby("figid")["entrez"].count().max()

9

For GMT files, tabs and percent signs are special characters. Let's check whether we have any that require escaping.

No figids or figtitles have tabs, and no figid has a percent sign.

In [173]:
for i, row in (
    pfocr_df[
        pfocr_df["figtitle"].str.contains("\t")
        | pfocr_df["figid"].str.contains("\t")
        | pfocr_df["figid"].str.contains("\%")
    ][["figid", "figtitle"]]
    .drop_duplicates()
    .iterrows()
):
    figtitle = row["figtitle"]
    figid = row["figid"]
    print(f"{figid} '{figtitle}'")

Some figtitles do have at least one percent sign.

In [174]:
for i, row in (
    pfocr_df[pfocr_df["figtitle"].str.contains("\%")][["figid", "figtitle"]]
    .drop_duplicates()
    .iterrows()
):
    figtitle = row["figtitle"]
    figid = row["figid"]
    print(figid)
    print(figtitle)
    print("")

PMC2703816__nihms100324f3.jpg
Functional annotation of 115 C/EBPŒ≤ target genes presented in pie chart format shows that 74% of target genes are dedicated to signaling, metabolism, and transport, with the remaining 26% having roles in transcription, synaptic transmission, differentiation and proliferation

PMC2730981__nihms137404f1.jpg
Activation of inflammatory pathway mediated through NF-Œ∫B by life-style related factors such as tobacco, stress, dietary agents, obesity, alcohol, infectious agents, irradiation and environmental stimuli that account for as much as 95% of all cancers

PMC2750869__nihms91210f5.jpg
Final elaboration of 8, the requisite cyclization precursor, entailed treatment of 2-butanone (21) with (‚àí)-DIPCl and Et3N according to conditions developed by Paterson,, followed by addition of aldehyde (+)-20; cyclization precursor (8) was obtained as an inseparable mixture of diastereomers (5:1) in 86% yield ()

PMC3188852__nihms196597f11.jpg
In order to construct the quat

In [152]:
pfocr_df[pfocr_df["figid"].str.contains("\t")]

Unnamed: 0,figid,number,reftext,year,pathway_score,pmc_ranked_result_index,figtitle,papertitle,caption,figlink,...,automl_index,figurl,word,symbol,source,hgnc_symbol,entrez,nobe_count,entrez_count,unique_gene_count


In [175]:
# A figid that rpy2 may be importing incorrectly. Check the encoding of figtitles.
pfocr_figures_df[pfocr_figures_df["figid"] == "PMC2703816__nihms100324f3.jpg"]

Unnamed: 0,figid,number,reftext,year,pathway_score,pmc_ranked_result_index,figtitle,papertitle,caption,figlink,pmcid,filename,source_f,type.man,automl_index,figurl
61692,PMC2703816__nihms100324f3.jpg,Figure 3,"N. Kfoury, et al. Mol Cell Neurosci. ;40(3):31...",,0.708703,222034,Functional annotation of 115 C/EBPŒ≤ target ge...,Identification of neuronal target genes for CC...,Functional annotation of 115 C/EBPŒ≤ target ge...,/pmc/articles/PMC2703816/figure/F3/,PMC2703816,nihms100324f3.jpg,../data/images/PMC2703816__nihms100324f3.jpg,,58133,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...


In [176]:
import pandas as pd
import rpy2

import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr

pandas2ri.activate()
base = importr("base")
readRDS = robjects.r["readRDS"]

In [None]:
def rdf2pandas(df):
    df_names = base.names(df)
    data = dict()
    for i in range(len(df_names)):
        column = df[i]
        if hasattr(column, "iter_labels"):
            data[df_names[i]] = [x for x in column.iter_labels()]
        else:
            data[df_names[i]] = [x for x in column]

    return pd.DataFrame(data=data)


def rds2pandas(f):
    return rdf2pandas(readRDS(str(f)))

In [177]:
hmm_pfocr_figures_df = readRDS(
    "~/Dropbox (Gladstone)/Documents/PFOCR_25Years/pfocr_figures.rds"
)
hmm_pfocr_figures_df.head()

figid,pmcid,filename,...,number,caption,organism
'PMC5...,'PMC5...,'4159...,...,'Figu...,'Mode...,'Homo...
'PMC4...,'PMC4...,'zh20...,...,'Fig....,'Prop...,'Homo...
'PMC5...,'PMC5...,'rsob...,...,'Figu...,'Aβ p...,'Homo...
'PMC4...,'PMC4...,'pone...,...,'Figu...,'Gluc...,'Homo...
'PMC2...,'PMC2...,'nihm...,...,'Figu...,'Bact...,'Homo...
'PMC4...,'PMC4...,'emss...,...,'Figu...,'Sche...,'Homo...


In [184]:
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri

from rpy2.robjects.conversion import localconverter

In [185]:
r_df = ro.DataFrame(
    {
        "int_values": ro.IntVector([1, 2, 3]),
        "str_values": ro.StrVector(["abc", "def", "ghi"]),
    }
)

r_df

int_values,str_values
1,'abc'
2,'def'
3,'ghi'


Demo from [here](https://rpy2.github.io/doc/v3.4.x/html/generated_rst/pandas.html)

In [192]:
with localconverter(ro.default_converter + pandas2ri.converter):
    pd_from_r_df = ro.conversion.ri2py_dataframe(r_df)

pd_from_r_df

AttributeError: module 'rpy2.robjects.conversion' has no attribute 'ri2py_dataframe'

In [195]:
from rpy2.robjects import pandas2ri
from rpy2.robjects import default_converter
from rpy2.robjects.conversion import localconverter

with localconverter(default_converter + pandas2ri.converter) as cv:
    pd_from_r_df = pandas2ri.ri2py(r_df)
pd_from_r_df

ValueError: Buffer for this type not yet supported.

In [193]:
pandas2ri.ri2py_dataframe(r_df)

ValueError: Buffer for this type not yet supported.

In [188]:
rpy2.situation

<module 'rpy2.situation' from '/nix/store/hm475d8nhddsi2cvipkhl8k06cmxn7kh-python3-3.7.5-env/lib/python3.7/site-packages/rpy2/situation.py'>

In [189]:
ro.conversion.ri2py

<function rpy2.robjects.conversion._ri2py(obj)>

In [197]:
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri

from rpy2.robjects.conversion import localconverter

In [198]:
pd_df = pd.DataFrame({"int_values": [1, 2, 3], "str_values": ["abc", "def", "ghi"]})

pd_df

Unnamed: 0,int_values,str_values
0,1,abc
1,2,def
2,3,ghi


In [199]:
with localconverter(ro.default_converter + pandas2ri.converter):
    r_from_pd_df = ro.conversion.py2rpy(pd_df)

r_from_pd_df

AttributeError: module 'rpy2.robjects.conversion' has no attribute 'py2rpy'

In [200]:
from functools import partial
from rpy2.ipython import html
html.html_rdataframe=partial(html.html_rdataframe, table_class="docutils")

ModuleNotFoundError: No module named 'simplegeneric'

In [35]:
# /run/user/1000/tmp_yz5_q44/pfocr20200224.gmt