In [1]:
%load_ext lab_black

# Export 2020-02-24 Results as GMT

In [2]:
from pprint import pprint
import json
import os
import tempfile
from pathlib import Path
import re
import sys

import numpy as np
import pandas as pd

import requests
import requests_cache

import rpy2

In [3]:
requests_cache.install_cache("pfocr_cache")

In [4]:
from rdf2pandas import rds2pandas

## Import Data 

In [5]:
# figures
pfocr_figures_rds_url = (
    "https://www.dropbox.com/s/qhc33zho78rnaoj/pfocr_figures.rds?dl=1"
)

with tempfile.NamedTemporaryFile(suffix=".rds") as f:
    pfocr_figures_rds_path = f.name
    with requests.get(pfocr_figures_rds_url, stream=True) as r:
        for chunk in r.iter_content(chunk_size=128):
            f.write(chunk)
        f.seek(0)
    pfocr_figures_df = rds2pandas(pfocr_figures_rds_path)
pfocr_figures_df["figurl"] = (
    "https://www.ncbi.nlm.nih.gov/pmc/articles/"
    + pfocr_figures_df["pmcid"]
    + "/bin/"
    + pfocr_figures_df["filename"]
)
pfocr_figures_df

Unnamed: 0,figid,number,reftext,year,pathway_score,pmc_ranked_result_index,figtitle,papertitle,caption,figlink,pmcid,filename,source_f,type.man,automl_index,figurl
0,PMC5653847__41598_2017_14124_Fig8_HTML.jpg,Figure 8,"Céline Barthelemy, et al. Sci Rep. 2017;7:13816.",2017,0.968270,133303,Model of FTY720-induced transporter endocytosi...,FTY720-induced endocytosis of yeast and human ...,Model of FTY720-induced transporter endocytosi...,/pmc/articles/PMC5653847/figure/Fig8/,PMC5653847,41598_2017_14124_Fig8_HTML.jpg,../data/images/PMC5653847__41598_2017_14124_Fi...,,3012,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
1,PMC4187043__zh20191474070013.jpg,Fig. 13,"Yuan Wei, et al. Am J Physiol Renal Physiol. 2...",2014,0.965793,79929,Proposed signaling pathway by which the stimul...,Angiotensin II type 2 receptor regulates ROMK-...,Proposed signaling pathway by which the stimul...,/pmc/articles/PMC4187043/figure/F13/,PMC4187043,zh20191474070013.jpg,../data/images/PMC4187043__zh20191474070013.jpg,,4323,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
2,PMC5746550__rsob-7-170228-g1.jpg,Figure 1,"Georgia R. Frost, et al. Open Biol. 2017 Dec;7...",2017,0.962470,98034,AŒ≤ production,The role of astrocytes in amyloid production a...,AŒ≤ production. In the amyloidogenic pathway (...,/pmc/articles/PMC5746550/figure/RSOB170228F1/,PMC5746550,rsob-7-170228-g1.jpg,../data/images/PMC5746550__rsob-7-170228-g1.jpg,,6334,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
3,PMC4211692__pone.0110875.g008.jpg,Figure 8,"Enida Gjoni, et al. PLoS One. 2014;9(10):e110875.",2014,0.966721,142401,,Glucolipotoxicity Impairs Ceramide Flow from t...,Glucolipotoxicity impairs CERT- and vesicular-...,/pmc/articles/PMC4211692/figure/pone-0110875-g...,PMC4211692,pone.0110875.g008.jpg,../data/images/PMC4211692__pone.0110875.g008.jpg,,3808,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
4,PMC2588433__nihms78212f8.jpg,Figure 8,"Amanda L. Lewis, et al. J Biol Chem. ;282(38):...",,0.966758,67398,,NeuA sialic acid O-acetylesterase activity mod...,Bacterial Sia biosynthesis can be divided into...,/pmc/articles/PMC2588433/figure/F8/,PMC2588433,nihms78212f8.jpg,../data/images/PMC2588433__nihms78212f8.jpg,,3790,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64638,PMC4216988__zh20221474360006.jpg,Fig. 6,"Marcelo D. Carattino, et al. Am J Physiol Rena...",2014,0.143076,108774,Hypothetical mechanism of activation of ENaC b...,Prostasin interacts with the epithelial Na+ ch...,Hypothetical mechanism of activation of ENaC b...,/pmc/articles/PMC4216988/figure/F6/,PMC4216988,zh20221474360006.jpg,../data/images/PMC4216988__zh20221474360006.jpg,pathway,77324,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...
64639,PMC2873070__nihms128887f5.jpg,Scheme 1,"Hua Cheng, et al. Neurobiol Aging. ;31(7):1188...",,0.127176,143547,A schematic diagram of a proposed working mode...,Apolipoprotein E mediates sulfatide depletion ...,A schematic diagram of a proposed working mode...,/pmc/articles/PMC2873070/figure/F5/,PMC2873070,nihms128887f5.jpg,../data/images/PMC2873070__nihms128887f5.jpg,pathway,78813,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...
64640,PMC3651446__pnas.1220523110fig06.jpg,Fig. 6,"Jiun-Ming Wu, et al. Proc Natl Acad Sci U S A....",2013,0.055546,159643,Models for nucleation of centrosomal and kinet...,Aurora kinase inhibitors reveal mechanisms of ...,Models for nucleation of centrosomal and kinet...,/pmc/articles/PMC3651446/figure/fig06/,PMC3651446,pnas.1220523110fig06.jpg,../data/images/PMC3651446__pnas.1220523110fig0...,pathway,114977,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...
64641,PMC6770832__cancers-11-01236-g005.jpg,Figure 5,"Carmel Mothersill, et al. Cancers (Basel). 201...",2019,0.140041,618,A simplified TGFŒ≤ pathway leading to p21 expr...,Relevance of Non-Targeted Effects for Radiothe...,A simplified TGFŒ≤ pathway leading to p21 expr...,/pmc/articles/PMC6770832/figure/cancers-11-012...,PMC6770832,cancers-11-01236-g005.jpg,../data/images/PMC6770832__cancers-11-01236-g0...,pathway,77606,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...


In [6]:
# genes
pfocr_genes_rds_url = "https://www.dropbox.com/s/alf7auvxve36oer/pfocr_genes.rds?dl=1"

with tempfile.NamedTemporaryFile(suffix=".rds") as f:
    pfocr_genes_rds_path = f.name
    with requests.get(pfocr_genes_rds_url, stream=True) as r:
        for chunk in r.iter_content(chunk_size=128):
            f.write(chunk)
        f.seek(0)
    pfocr_genes_df = rds2pandas(pfocr_genes_rds_path)
pfocr_genes_df

Unnamed: 0,figid,pmcid,word,symbol,source,hgnc_symbol,entrez
0,PMC100003__mb2410470011.jpg,PMC100003,"Ga12,Gaq",G-ALPHA-q,hgnc_alias_symbol,GNAQ,2776
1,PMC100003__mb2410470011.jpg,PMC100003,Etk,ETK,hgnc_alias_symbol,BMX,660
2,PMC100003__mb2410470011.jpg,PMC100003,FAK,FAK,hgnc_alias_symbol,PTK2,5747
3,PMC100003__mb2410470011.jpg,PMC100003,AR*,AR,hgnc_symbol,AR,367
4,PMC100003__mb2410470011.jpg,PMC100003,(Src,SRC,hgnc_symbol,SRC,6714
...,...,...,...,...,...,...,...
1112546,PMC99976__mb2310138007.jpg,PMC99976,MEK-2,MEK2,hgnc_alias_symbol,MAP2K2,5605
1112547,PMC99976__mb2310138007.jpg,PMC99976,RAS,RAS,bioentities_symbol,HRAS,3265
1112548,PMC99976__mb2310138007.jpg,PMC99976,RAS,RAS,bioentities_symbol,KRAS,3845
1112549,PMC99976__mb2310138007.jpg,PMC99976,RAS,RAS,bioentities_symbol,NRAS,4893


In [7]:
# nobe_counts
pfocr_nobe_counts_rds_url = (
    "https://www.dropbox.com/s/pd8n5hs4vrvb4st/pfocr_nobe_counts.rds?dl=1"
)

with tempfile.NamedTemporaryFile(suffix=".rds") as f:
    pfocr_nobe_counts_rds_path = f.name
    with requests.get(pfocr_nobe_counts_rds_url, stream=True) as r:
        for chunk in r.iter_content(chunk_size=128):
            f.write(chunk)
        f.seek(0)
    pfocr_nobe_counts_df = rds2pandas(pfocr_nobe_counts_rds_path)
pfocr_nobe_counts_df

Unnamed: 0,figid,nobe_count,entrez_count
0,PMC100003__mb2410470011.jpg,5,5
1,PMC100005__mb2410575011.jpg,6,16
2,PMC100008__mb2411709009.jpg,8,52
3,PMC101225__1475-2867-1-1-1.jpg,1,1
4,PMC101242__gkf20707.jpg,13,13
...,...,...,...
58957,PMC99889__mb2110211013.jpg,12,22
58958,PMC99903__mb2110106008.jpg,1,1
58959,PMC99957__mb2310813001.jpg,10,10
58960,PMC99957__mb2310813011.jpg,2,2


In [8]:
# merge the dfs to get pfocr analysis set
pfocr_df = (
    (
        pfocr_figures_df.join(
            pfocr_genes_df.drop(columns=["pmcid"]).set_index("figid"),
            on="figid",
            how="inner",
        ).join(
            pfocr_nobe_counts_df.set_index("figid"), on="figid", how="inner",
        )
    )
    .sort_values(["year", "pmcid", "figid", "entrez"])
    .reset_index(drop=True)
)
pfocr_df

Unnamed: 0,figid,number,reftext,year,pathway_score,pmc_ranked_result_index,figtitle,papertitle,caption,figlink,...,type.man,automl_index,figurl,word,symbol,source,hgnc_symbol,entrez,nobe_count,entrez_count
0,PMC1201394__nihms3677f4.jpg,Fig. 4,"Marissa J. Carstens, et al. J Biol Chem. ;279(...",,0.894122,38017,,Cell Cycle Arrest and Cell Death Are Controlle...,"A, Mdm2 is negatively regulated by p19Arf. As ...",/pmc/articles/PMC1201394/figure/F4/,...,,39679,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,(Cdkn2a,CDKN2A,hgnc_symbol,CDKN2A,1029,6,7
1,PMC1201394__nihms3677f4.jpg,Fig. 4,"Marissa J. Carstens, et al. J Biol Chem. ;279(...",,0.894122,38017,,Cell Cycle Arrest and Cell Death Are Controlle...,"A, Mdm2 is negatively regulated by p19Arf. As ...",/pmc/articles/PMC1201394/figure/F4/,...,,39679,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,p19/Arf,ARF,hgnc_alias_symbol,CDKN2A,1029,6,7
2,PMC1201394__nihms3677f4.jpg,Fig. 4,"Marissa J. Carstens, et al. J Biol Chem. ;279(...",,0.894122,38017,,Cell Cycle Arrest and Cell Death Are Controlle...,"A, Mdm2 is negatively regulated by p19Arf. As ...",/pmc/articles/PMC1201394/figure/F4/,...,,39679,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,G1/S,Gs,bioentities_symbol,GNAL,2774,6,7
3,PMC1201394__nihms3677f4.jpg,Fig. 4,"Marissa J. Carstens, et al. J Biol Chem. ;279(...",,0.894122,38017,,Cell Cycle Arrest and Cell Death Are Controlle...,"A, Mdm2 is negatively regulated by p19Arf. As ...",/pmc/articles/PMC1201394/figure/F4/,...,,39679,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,G1/S,Gs,bioentities_symbol,GNAS,2778,6,7
4,PMC1201394__nihms3677f4.jpg,Fig. 4,"Marissa J. Carstens, et al. J Biol Chem. ;279(...",,0.894122,38017,,Cell Cycle Arrest and Cell Death Are Controlle...,"A, Mdm2 is negatively regulated by p19Arf. As ...",/pmc/articles/PMC1201394/figure/F4/,...,,39679,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,Mdm2,MDM2,hgnc_symbol,MDM2,4193,6,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1112546,PMC6906418__12276_2019_299_Fig2_HTML.jpg,Fig. 2,"Hyun-Cheol Lee, et al. Exp Mol Med. 2019 Dec;5...",2019,0.828043,65444,,Intracellular sensing of viral genomes and vir...,Schematic presentation of positive and negativ...,/pmc/articles/PMC6906418/figure/Fig2/,...,,50139,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,TRIM41,TRIM41,hgnc_symbol,TRIM41,90933,24,28
1112547,PMC6906418__12276_2019_299_Fig2_HTML.jpg,Fig. 2,"Hyun-Cheol Lee, et al. Exp Mol Med. 2019 Dec;5...",2019,0.828043,65444,,Intracellular sensing of viral genomes and vir...,Schematic presentation of positive and negativ...,/pmc/articles/PMC6906418/figure/Fig2/,...,,50139,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,RNF185,RNF185,hgnc_symbol,RNF185,91445,24,28
1112548,PMC6906418__12276_2019_299_Fig2_HTML.jpg,Fig. 2,"Hyun-Cheol Lee, et al. Exp Mol Med. 2019 Dec;5...",2019,0.828043,65444,,Intracellular sensing of viral genomes and vir...,Schematic presentation of positive and negativ...,/pmc/articles/PMC6906418/figure/Fig2/,...,,50139,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,CGAS,CGAS,hgnc_symbol,CGAS,115004,24,28
1112549,PMC6906418__12276_2019_299_Fig2_HTML.jpg,Fig. 2,"Hyun-Cheol Lee, et al. Exp Mol Med. 2019 Dec;5...",2019,0.828043,65444,,Intracellular sensing of viral genomes and vir...,Schematic presentation of positive and negativ...,/pmc/articles/PMC6906418/figure/Fig2/,...,,50139,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,TTLL6,TTLL6,hgnc_symbol,TTLL6,284076,24,28


In [9]:
gene_counts_df = (
    pfocr_df[["figid", "entrez"]]
    .drop_duplicates()
    .groupby("figid")
    .count()
    .rename(columns={"entrez": "unique_gene_count"})
)
pfocr_df = pfocr_df.join(gene_counts_df, on="figid")

Limit to figures having 10 or more unique genes.

In [10]:
cutoff_pfocr_df = pfocr_df[pfocr_df["unique_gene_count"] >= 10].copy(deep=True)
cutoff_pfocr_df

Unnamed: 0,figid,number,reftext,year,pathway_score,pmc_ranked_result_index,figtitle,papertitle,caption,figlink,...,automl_index,figurl,word,symbol,source,hgnc_symbol,entrez,nobe_count,entrez_count,unique_gene_count
16,PMC1249490__nihms5296f3.jpg,Fig. 3,Robin S.B. Williams. Prog Neuropsychopharmacol...,,0.967952,94832,The primary targets of lithium in the cell are...,Pharmacogenetics in model systems: Defining a ...,The primary targets of lithium in the cell are...,/pmc/articles/PMC1249490/figure/F3/,...,3162,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,GS,Gs,bioentities_symbol,GNAL,2774,4,33,33
17,PMC1249490__nihms5296f3.jpg,Fig. 3,Robin S.B. Williams. Prog Neuropsychopharmacol...,,0.967952,94832,The primary targets of lithium in the cell are...,Pharmacogenetics in model systems: Defining a ...,The primary targets of lithium in the cell are...,/pmc/articles/PMC1249490/figure/F3/,...,3162,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,GS,Gs,bioentities_symbol,GNAS,2778,4,33,33
18,PMC1249490__nihms5296f3.jpg,Fig. 3,Robin S.B. Williams. Prog Neuropsychopharmacol...,,0.967952,94832,The primary targets of lithium in the cell are...,Pharmacogenetics in model systems: Defining a ...,The primary targets of lithium in the cell are...,/pmc/articles/PMC1249490/figure/F3/,...,3162,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,PLC,PLC,bioentities_symbol,PLCB2,5330,4,33,33
19,PMC1249490__nihms5296f3.jpg,Fig. 3,Robin S.B. Williams. Prog Neuropsychopharmacol...,,0.967952,94832,The primary targets of lithium in the cell are...,Pharmacogenetics in model systems: Defining a ...,The primary targets of lithium in the cell are...,/pmc/articles/PMC1249490/figure/F3/,...,3162,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,PLC,PLC,bioentities_symbol,PLCB3,5331,4,33,33
20,PMC1249490__nihms5296f3.jpg,Fig. 3,Robin S.B. Williams. Prog Neuropsychopharmacol...,,0.967952,94832,The primary targets of lithium in the cell are...,Pharmacogenetics in model systems: Defining a ...,The primary targets of lithium in the cell are...,/pmc/articles/PMC1249490/figure/F3/,...,3162,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,PLC,PLC,bioentities_symbol,PLCB4,5332,4,33,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1112546,PMC6906418__12276_2019_299_Fig2_HTML.jpg,Fig. 2,"Hyun-Cheol Lee, et al. Exp Mol Med. 2019 Dec;5...",2019,0.828043,65444,,Intracellular sensing of viral genomes and vir...,Schematic presentation of positive and negativ...,/pmc/articles/PMC6906418/figure/Fig2/,...,50139,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,TRIM41,TRIM41,hgnc_symbol,TRIM41,90933,24,28,28
1112547,PMC6906418__12276_2019_299_Fig2_HTML.jpg,Fig. 2,"Hyun-Cheol Lee, et al. Exp Mol Med. 2019 Dec;5...",2019,0.828043,65444,,Intracellular sensing of viral genomes and vir...,Schematic presentation of positive and negativ...,/pmc/articles/PMC6906418/figure/Fig2/,...,50139,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,RNF185,RNF185,hgnc_symbol,RNF185,91445,24,28,28
1112548,PMC6906418__12276_2019_299_Fig2_HTML.jpg,Fig. 2,"Hyun-Cheol Lee, et al. Exp Mol Med. 2019 Dec;5...",2019,0.828043,65444,,Intracellular sensing of viral genomes and vir...,Schematic presentation of positive and negativ...,/pmc/articles/PMC6906418/figure/Fig2/,...,50139,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,CGAS,CGAS,hgnc_symbol,CGAS,115004,24,28,28
1112549,PMC6906418__12276_2019_299_Fig2_HTML.jpg,Fig. 2,"Hyun-Cheol Lee, et al. Exp Mol Med. 2019 Dec;5...",2019,0.828043,65444,,Intracellular sensing of viral genomes and vir...,Schematic presentation of positive and negativ...,/pmc/articles/PMC6906418/figure/Fig2/,...,50139,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,TTLL6,TTLL6,hgnc_symbol,TTLL6,284076,24,28,28


In [11]:
print(len(cutoff_pfocr_df["figurl"].unique()))
print(len(cutoff_pfocr_df["figid"].unique()))
print(len(cutoff_pfocr_df["figtitle"].unique()))
print(len(cutoff_pfocr_df["caption"].unique()))
print(len(cutoff_pfocr_df["papertitle"].unique()))

32275
32277
23450
32210
28593


## Export GMT

Goal is to mimic the GMT format we use for WikiPathways:
```
FABP4 in ovarian cancer%WikiPathways_20200210%WP4400%Homo sapiens	http://www.wikipathways.org/instance/WP4400_r108112	574413	2167
```

TODO: there are some characters that could mess up the parsing of a GMT. For example, a `%` or a `\t` would be parsed as a delimiter. Also, if a doublequote `"` can be used to enclose fields with characters like spaces, then a title containing a doublequote would mess up a parser.

In those cases, do we want to escape the special characters or remove them?

In [12]:
def genes_to_gmt(df):
    return df["entrez"].astype("str").str.cat(sep="\t")

In [13]:
gmt_df = (
    cutoff_pfocr_df[["figid", "figurl", "figtitle"]]
    .drop_duplicates()
    .set_index("figid")
    .copy(deep=True)
)
gmt_df["version"] = "PFOCR_20200224"
gmt_df["organism"] = "Homo sapiens"
gmt_df["genes"] = (
    cutoff_pfocr_df.set_index("figid").groupby("figid").apply(genes_to_gmt)
)
gmt_df = gmt_df.fillna("")

gmt_df

Unnamed: 0_level_0,figurl,figtitle,version,organism,genes
figid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
PMC1249490__nihms5296f3.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,The primary targets of lithium in the cell are...,PFOCR_20200224,Homo sapiens,2774\t2778\t5330\t5331\t5332\t5333\t5335\t5336...
PMC1307498__nihms5498f3.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,Schematic model of insulin-stimulated glucose ...,PFOCR_20200224,Homo sapiens,207\t207\t208\t3667\t5163\t5164\t5165\t5166\t5...
PMC1307511__nihms2079f3.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,,PFOCR_20200224,Homo sapiens,317\t834\t835\t836\t837\t838\t839\t840\t841\t8...
PMC1351030__nihms2404f8.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,A model of signaling pathway from oncogenic H-...,PFOCR_20200224,Homo sapiens,595\t688\t1432\t1958\t3265\t5594\t5595\t5599\t...
PMC1352153__nihms-7536-0007.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...,Model for GalR1 neuroplasticity within the LC,PFOCR_20200224,Homo sapiens,466\t1385\t1386\t1390\t2587\t4236\t5566\t5567\...
...,...,...,...,...,...
PMC6904862__WJSC-11-1104-g003.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Biomechanical stress stimulates the mechanosen...,PFOCR_20200224,Homo sapiens,207\t208\t1432\t5594\t5595\t5599\t5600\t5601\t...
PMC6904864__WJSC-11-1084-g001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Small molecules guiding mesenchymal stem cell ...,PFOCR_20200224,Homo sapiens,207\t208\t649\t650\t651\t652\t653\t654\t655\t6...
PMC6905007__40246_2019_252_Fig2_HTML.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,LRF/ZBTB7A‚Äôs silencing compromises Warburg e...,PFOCR_20200224,Homo sapiens,355\t596\t834\t835\t836\t837\t838\t839\t840\t8...
PMC6906418__12276_2019_299_Fig1_HTML.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,,PFOCR_20200224,Homo sapiens,834\t835\t836\t837\t838\t839\t840\t841\t842\t8...


Let's export the GMT file. The paths match AR's system, but you'll need to update them if yours aren't the same.

In [41]:
dropbox_dir = Path("~/Dropbox (Gladstone)").expanduser().resolve()
paper_dir = dropbox_dir.joinpath("Documents/PFOCR_25Years").expanduser().resolve()
data_dir = paper_dir.joinpath("raw")

if not data_dir.exists():
    tmp_dir = tempfile.TemporaryDirectory()
    # can explicitly remove tmp_dir with tmp_dir.cleanup()
    data_dir = Path(tmp_dir.name)

gmt_path = data_dir.joinpath("pfocr20200224.gmt")

with open(gmt_path, "w") as gmt_f:
    for figid, row in gmt_df.iterrows():
        gmt_f.write(
            "\t".join(
                [
                    "%".join([row["figtitle"], figid, row["version"], row["organism"]]),
                    row["figurl"],
                    row["genes"],
                ]
            )
        )
        gmt_f.write("\n")

print(f"Output saved: {gmt_path}")

Output saved: /run/user/1000/tmp_yz5_q44/pfocr20200224.gmt
