In [1]:
%load_ext lab_black

In [2]:
import json
import ndex2.client
import os
import numpy as np
import pandas as pd
from pathlib import Path
import pprint

from caching import cache_from_dropbox

In [3]:
# Shared pretty-printer (4-space indent); used further down to display nested dicts.
pp = pprint.PrettyPrinter(indent=4)

In [None]:
import getpass

# Credentials come from the environment when available; otherwise prompt
# interactively so that secrets never have to be stored in the notebook.
ndex_user = os.environ.get("NDEX_USER")
ndex_pwd = os.environ.get("NDEX_PWD")

if not ndex_user:
    print("Please enter your NDEx username:")
    ndex_user = input().strip()

# Fail fast: without a username neither server branch below would run, which
# would leave `ndex_url` undefined and surface later as a confusing NameError.
if not ndex_user:
    raise ValueError(
        "No NDEx username provided. Set NDEX_USER or enter one when prompted."
    )

if not ndex_pwd:
    print("Please enter your NDEx password:")
    ndex_pwd = getpass.getpass()

# NOTE(review): both URLs below are plain http:// and the password is later
# passed to the NDEx client together with this URL -- consider switching to
# https:// once confirmed that the servers and client support it.
if ndex_user == "wikipathways":
    # for prod
    ndex_url = "http://www.ndexbio.org"
    networkset_ids_by_name = {
        "Published Pathway Figures - Analysis Set": "85034b42-de8a-11ea-99da-0ac135e8bacf",
        "WikiPathways Collection - Homo sapiens": "453c1c63-5c10-11e9-9f06-0ac135e8bacf",
        "CPTAC Cancer Hallmark Networks": "9541cc61-4cf0-11e9-9f06-0ac135e8bacf",
    }
else:
    # for test/dev (any other user)
    ndex_url = "http://test.ndexbio.org"
    networkset_ids_by_name = {
        "Published Pathway Figures - Analysis Set": "8970df33-d6bd-11ea-9101-0660b7976219",
        "wikipathways-gpml-Homo_sapiens": "b44b7ca7-4da1-11e9-9fc6-0660b7976219",
    }

print(f"Running as {ndex_user} on {ndex_url}")

## NDEx

Let's get the current user's info from NDEx to check that we can connect correctly:

In [5]:
# Unauthenticated client: only the server URL is supplied, no credentials.
anon_ndex = ndex2.client.Ndex2(ndex_url)

In [6]:
# Look up the profile for `ndex_user` through the unauthenticated client.
anon_ndex.get_user_by_username(ndex_user)

{'properties': {},
 'isIndividual': True,
 'userName': 'wikipathways',
 'isVerified': True,
 'firstName': 'WikiPathways',
 'lastName': 'Project',
 'image': 'https://upload.wikimedia.org/wikipedia/commons/8/83/Wplogo_with_text_500.png',
 'website': 'https://www.wikipathways.org',
 'description': '<p></p><h6>Welcome to the NDEx repository for <a href="https://wikipathways.org" target="">WikiPathways</a>. Here we provide network versions of our curated and approved pathway content. <br/>All our content is freely available under the CC0 waiver. Feel free to clone, download, reuse, adapt and publish. Here\'s <a href="https://www.wikipathways.org/index.php/How_to_cite_WikiPathways" target="">how to cite us</a>. <br/></h6><h6><b>For WikiPathways Authors</b>: to add new WikiPathways models or edit the original existing ones, please visit <a href="https://wikipathways.org">our website.</a> <br/></h6>',
 'externalId': '363f49e0-4cf0-11e9-9f06-0ac135e8bacf',
 'isDeleted': False,
 'modificationTim

Now we'll log in as the current user:

In [7]:
# Authenticated client: same server URL, plus the username and password.
my_ndex = ndex2.client.Ndex2(ndex_url, ndex_user, ndex_pwd)

And we'll get the info for the PFOCR network set:

In [8]:
# ID of the PFOCR network set on whichever server was selected for this user.
pfocr_networkset_id = networkset_ids_by_name["Published Pathway Figures - Analysis Set"]

# Fetch the network set record; its "networks" entry is used below as the
# list of member network IDs. Display the available keys.
pfocr_networkset_data = my_ndex.get_networkset(pfocr_networkset_id)
pfocr_networkset_data.keys()

dict_keys(['name', 'description', 'ownerId', 'networks', 'showcased', 'properties', 'externalId', 'isDeleted', 'modificationTime', 'creationTime'])

In [9]:
# Number of networks currently listed in the PFOCR network set.
len(pfocr_networkset_data["networks"])

32263

In [10]:
# Fetch the summary of the first network in the set and show which fields a summary has.
sample_network = my_ndex.get_network_summary(pfocr_networkset_data["networks"][0])
sample_network.keys()



I think we can't search for a network by `pfocr_id` (`figid`) because we didn't specify that these networks should be indexed:
```
Visibility: Public (not searchable)
```

Was this just for test.ndexbio.org or for production too? I think we only index the network set, not the individual networks contained.

How should we find `pfocr_id` by `network_id`? What does `include_groups` mean?

In [12]:
# Probe: search this account's networks for one specific figure ID
# (first 100 hits), to see whether individual networks are searchable.
my_ndex.search_networks(
    search_string="PMC5868458__fphys-09-00170-g0003.jpg",
    account_name=ndex_user,
    start=0,
    size=100,
    include_groups=True,
)

{'numFound': 0, 'start': 0, 'networks': []}

## Map `pfocr_id` to `network_id`

Let's get the mappings from `pfocr_id` (`figid`) to `network_id`:

TODO: is there a better endpoint to call for this now? Maybe look into the following:
>Get Network Summaries By UUIDs
>
>/batch/network/summary?accesskey={accessKey}

https://home.ndexbio.org/using-the-ndex-server-api/

In [13]:
# Per-user JSON cache of the pfocr_id -> network_id mapping. Building it costs
# one NDEx request per network, so it is only rebuilt when the file is absent.
cached_pfocr_id_to_network_id_filepath = (
    f"../data/{ndex_user}_pfocr_id_to_network_id.json"
)

pfocr_id_to_network_id = dict()

cached_pfocr_id_to_network_id_f = (
    Path(cached_pfocr_id_to_network_id_filepath).expanduser().resolve()
)

# Networks rewritten by `update_networks` (further down) carry the
# lowerCamelCase predicate `pfocrId`; networks that have not been rewritten
# yet still use the snake_case `pfocr_id`. Accept either.
LEGACY_PFOCR_ID_PREDICATE_STRING = "pfocr_id"
PFOCR_ID_PREDICATE_STRINGS = ("pfocrId", LEGACY_PFOCR_ID_PREDICATE_STRING)

if cached_pfocr_id_to_network_id_f.exists():
    with open(cached_pfocr_id_to_network_id_f, "r") as f:
        pfocr_id_to_network_id = json.load(f)
else:
    for network_id in pfocr_networkset_data["networks"]:
        network_summary = my_ndex.get_network_summary(network_id)
        properties = network_summary.get("properties", [])

        pfocr_id_property = next(
            (
                p
                for p in properties
                if p["predicateString"] in PFOCR_ID_PREDICATE_STRINGS
            ),
            None,
        )
        if pfocr_id_property is None:
            # Without a default, next() would raise a bare StopIteration that
            # does not say which network is the problem.
            raise Exception(
                f"Network {network_id} has no property named any of {PFOCR_ID_PREDICATE_STRINGS}"
            )
        pfocr_id = pfocr_id_property["value"]

        # Sanity check for not-yet-updated networks only: `update_networks`
        # replaces the network name with the figure title, so the name can
        # only be expected to equal the pfocr_id while the legacy predicate
        # is still in place.
        if (
            pfocr_id_property["predicateString"] == LEGACY_PFOCR_ID_PREDICATE_STRING
            and pfocr_id != network_summary["name"]
        ):
            raise Exception(
                f"Expected pfocr_id {pfocr_id} to equal network_summary['name'] {network_summary['name']}"
            )

        # Each figure must map to exactly one network.
        if pfocr_id in pfocr_id_to_network_id:
            raise Exception(
                f"pfocr_id_to_network_id[{pfocr_id}] already set: {pfocr_id_to_network_id[pfocr_id]}"
            )
        pfocr_id_to_network_id[pfocr_id] = network_id

    with open(cached_pfocr_id_to_network_id_f, "w") as f:
        json.dump(pfocr_id_to_network_id, f)

In [14]:
# Peek at one (pfocr_id, network_id) pair to confirm the shape of the mapping.
list(pfocr_id_to_network_id.items())[0]

('PMC6372626__41467_2019_8576_Fig6_HTML.jpg',
 '0626a9a9-df0c-11ea-99da-0ac135e8bacf')

## Load PFOCR Data

In [15]:
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri

from rpy2.robjects.conversion import localconverter

# Activate rpy2's pandas conversion (pandas2ri).
pandas2ri.activate()
# Import R's "base" package and grab R's `readRDS`, used below to load the .rds files.
base = importr("base")
readRDS = ro.r["readRDS"]



### figures

In [16]:
# Fetch (or reuse the already-cached copy of) the analysis-set figure ID list.
analysis_set_figure_ids_url = (
    "https://www.dropbox.com/s/r7kc2hwzou3r2gm/analysis_set_figure_ids.tsv?dl=1"
)
analysis_set_figure_ids_f = Path(
    "~/Dropbox (Gladstone)/Documents/PFOCR_25Years/analysis_set_figure_ids.tsv"
).expanduser()
cache_from_dropbox(url=analysis_set_figure_ids_url, dest=analysis_set_figure_ids_f)

# Every line of the file becomes one member of the set.
with open(analysis_set_figure_ids_f, "r") as f:
    analysis_set_figure_ids = set(f.read().splitlines())

len(analysis_set_figure_ids)

/home/ariutta/Dropbox (Gladstone)/Documents/PFOCR_25Years/analysis_set_figure_ids.tsv exists. Assuming already cached.


32279

In [17]:
# Fetch (or reuse the cached copy of) the figures table and read it through R.
pfocr_figures_rds_url = (
    "https://www.dropbox.com/s/n5j6vrd1v93ve05/pfocr_figures.rds?dl=1"
)
pfocr_figures_f = "~/Dropbox (Gladstone)/Documents/PFOCR_25Years/pfocr_figures.rds"
cache_from_dropbox(url=pfocr_figures_rds_url, dest=pfocr_figures_f)
pfocr_figures_rdf = readRDS(pfocr_figures_f)

# Convert the R object to pandas inside the conversion context.
with localconverter(ro.default_converter + pandas2ri.converter):
    pfocr_figures_converted_df = ro.conversion.rpy2py(pfocr_figures_rdf)

# Give the columns the names used throughout the rest of the notebook.
figures_column_renames = {
    "figid": "pfocr_id",
    "year": "publication_year",
    "number": "figure_number",
    "figtitle": "figure_title",
    "papertitle": "paper_title",
}
pfocr_figures_df = pfocr_figures_converted_df.rename(columns=figures_column_renames)

# Link to the paper on PMC, built from the PMC ID.
pfocr_figures_df["paper_link"] = (
    "https://www.ncbi.nlm.nih.gov/pmc/articles/" + pfocr_figures_df["pmcid"]
)

# pfocr_figures_df["figure_link"] = (
#    "https://www.ncbi.nlm.nih.gov/pmc/articles/"
#    + pfocr_figures_df["pmcid"]
#    + "/bin/"
#    + pfocr_figures_df["filename"]
# )

# Link to the figure, built from the site-relative `figlink` column.
pfocr_figures_df["figure_link"] = (
    "https://www.ncbi.nlm.nih.gov" + pfocr_figures_df["figlink"]
)


pfocr_figures_df.head(2)

~/Dropbox (Gladstone)/Documents/PFOCR_25Years/pfocr_figures.rds exists. Assuming already cached.


Unnamed: 0,pfocr_id,pmcid,filename,publication_year,pathway_score,pmc_ranked_result_index,figlink,source_f,type.man,automl_index,reftext,paper_title,figure_title,figure_number,caption,organism,paper_link,figure_link
1,PMC5653847__41598_2017_14124_Fig8_HTML.jpg,PMC5653847,41598_2017_14124_Fig8_HTML.jpg,2017,0.96827,133303,/pmc/articles/PMC5653847/figure/Fig8/,../data/images/PMC5653847__41598_2017_14124_Fi...,,3012,"Céline Barthelemy, et al. Sci Rep. 2017;7:13816.",FTY720-induced endocytosis of yeast and human ...,Model of FTY720-induced transporter endocytosi...,Figure 8,Model of FTY720-induced transporter endocytosi...,Homo sapiens,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...
2,PMC4187043__zh20191474070013.jpg,PMC4187043,zh20191474070013.jpg,2014,0.965793,79929,/pmc/articles/PMC4187043/figure/F13/,../data/images/PMC4187043__zh20191474070013.jpg,,4323,"Yuan Wei, et al. Am J Physiol Renal Physiol. 2...",Angiotensin II type 2 receptor regulates ROMK-...,Stimulatory effect of ANG II on ROMK channel a...,Fig. 13,Proposed signaling pathway by which the stimul...,Homo sapiens,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...


In [18]:
# Sizes of: the analysis set, figures outside the analysis set, and
# analysis-set IDs that are missing from the figures table.
pfocr_ids_in_figures_df = set(pfocr_figures_df["pfocr_id"])
print(len(analysis_set_figure_ids))
print(len(pfocr_ids_in_figures_df - analysis_set_figure_ids))
print(len(analysis_set_figure_ids - pfocr_ids_in_figures_df))

32279
32364
0


Add column `network_id` (for NDEx):

In [19]:
# Look up each figure's NDEx network ID; figures without a network get None.
pfocr_figures_df["network_id"] = pfocr_figures_df["pfocr_id"].apply(
    pfocr_id_to_network_id.get
)
has_network_id = pfocr_figures_df["network_id"].notna()
pfocr_figures_df.loc[has_network_id].head(2)

Unnamed: 0,pfocr_id,pmcid,filename,publication_year,pathway_score,pmc_ranked_result_index,figlink,source_f,type.man,automl_index,reftext,paper_title,figure_title,figure_number,caption,organism,paper_link,figure_link,network_id
2,PMC4187043__zh20191474070013.jpg,PMC4187043,zh20191474070013.jpg,2014,0.965793,79929,/pmc/articles/PMC4187043/figure/F13/,../data/images/PMC4187043__zh20191474070013.jpg,,4323,"Yuan Wei, et al. Am J Physiol Renal Physiol. 2...",Angiotensin II type 2 receptor regulates ROMK-...,Stimulatory effect of ANG II on ROMK channel a...,Fig. 13,Proposed signaling pathway by which the stimul...,Homo sapiens,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,45985f21-de8d-11ea-99da-0ac135e8bacf
8,PMC2839263__bph0159-1051-f1.jpg,PMC2839263,bph0159-1051-f1.jpg,2010,0.963343,42250,/pmc/articles/PMC2839263/figure/fig01/,../data/images/PMC2839263__bph0159-1051-f1.jpg,,5754,"Gunnar Schulte, et al. Br J Pharmacol. 2010 Ma...",β-arrestins – scaffolds and signalling element...,Beta-arrestin related to the oncogene WNT1 sig...,Figure 1,Schematic summary of possible localization of ...,Homo sapiens,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...,469f95f4-de8d-11ea-99da-0ac135e8bacf


Let's add some columns we know we'll use:

In [20]:
# Columns whose value is the same for every figure.
constant_value_by_column_name = {
    "methods": "Pathway Figure OCR, https://www.biorxiv.org/content/10.1101/2020.05.29.124503v1.full ",
    "rights": "Waiver-No rights reserved (CC0)",
    "networkType": "pathway",
}
for constant_column_name, constant_value in constant_value_by_column_name.items():
    pfocr_figures_df[constant_column_name] = constant_value

## the following takes too much memory
# pfocr_df["reference"] = (
#    pfocr_df["reftext"].astype("str")
#    + "<b>"
#    + pfocr_df["paper_title"].astype("str")
#    + '</b> <a href="'
#    + pfocr_df["paper_link"].astype("str")
#    + 'target="_blank">'
#    + pfocr_df["paper_link"].astype("str")
#    + "</a>"
# )

### genes

In [21]:
# Fetch (or reuse the cached copy of) the genes table.
pfocr_genes_rds_url = "https://www.dropbox.com/s/alf7auvxve36oer/pfocr_genes.rds?dl=1"
pfocr_genes_f = "~/Dropbox (Gladstone)/Documents/PFOCR_25Years/pfocr_genes.rds"
cache_from_dropbox(url=pfocr_genes_rds_url, dest=pfocr_genes_f)
# Read the .rds file with R's readRDS, then convert the result to pandas
# inside the conversion context.
pfocr_genes_rdf = readRDS(pfocr_genes_f)
with localconverter(ro.default_converter + pandas2ri.converter):
    pfocr_genes_df = ro.conversion.rpy2py(pfocr_genes_rdf)

pfocr_genes_df.head(2)

~/Dropbox (Gladstone)/Documents/PFOCR_25Years/pfocr_genes.rds exists. Assuming already cached.


Unnamed: 0,figid,pmcid,word,symbol,source,hgnc_symbol,entrez
1,PMC100003__mb2410470011.jpg,PMC100003,"Ga12,Gaq",G-ALPHA-q,hgnc_alias_symbol,GNAQ,2776
2,PMC100003__mb2410470011.jpg,PMC100003,Etk,ETK,hgnc_alias_symbol,BMX,660


### merge

In [22]:
# merge the dfs to get pfocr analysis set
# Build the per-gene PFOCR table: one row per (figure, gene) pair.

# Gene rows keyed by figure ID. `pmcid` is dropped because the figures table
# already provides it.
pfocr_genes_by_pfocr_id_df = (
    pfocr_genes_df.drop(columns=["pmcid"])
    .rename(columns={"figid": "pfocr_id"})
    .set_index("pfocr_id")
)

# Inner join keeps only figures that have at least one gene row.
pfocr_figures_with_genes_df = pfocr_figures_df.join(
    pfocr_genes_by_pfocr_id_df,
    on="pfocr_id",
    how="inner",
)

pfocr_df = pfocr_figures_with_genes_df.sort_values(
    ["publication_year", "pmcid", "pfocr_id", "entrez"]
).reset_index(drop=True)

pfocr_df.head()

Unnamed: 0,pfocr_id,pmcid,filename,publication_year,pathway_score,pmc_ranked_result_index,figlink,source_f,type.man,automl_index,...,figure_link,network_id,methods,rights,networkType,word,symbol,source,hgnc_symbol,entrez
0,PMC6134364__GE-4-357-g005.jpg,PMC6134364,GE-4-357-g005.jpg,1995,0.882552,115615,/pmc/articles/PMC6134364/figure/fig5/,../data/images/PMC6134364__GE-4-357-g005.jpg,,42272,...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,a6088798-df17-11ea-99da-0ac135e8bacf,"Pathway Figure OCR, https://www.biorxiv.org/co...",Waiver-No rights reserved (CC0),pathway,elF-2-,ELF2,hgnc_symbol,ELF2,1998
1,PMC6134364__GE-4-357-g005.jpg,PMC6134364,GE-4-357-g005.jpg,1995,0.882552,115615,/pmc/articles/PMC6134364/figure/fig5/,../data/images/PMC6134364__GE-4-357-g005.jpg,,42272,...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,a6088798-df17-11ea-99da-0ac135e8bacf,"Pathway Figure OCR, https://www.biorxiv.org/co...",Waiver-No rights reserved (CC0),pathway,Interferon,Interferon,bioentities_symbol,IFNA1,3439
2,PMC6134364__GE-4-357-g005.jpg,PMC6134364,GE-4-357-g005.jpg,1995,0.882552,115615,/pmc/articles/PMC6134364/figure/fig5/,../data/images/PMC6134364__GE-4-357-g005.jpg,,42272,...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,a6088798-df17-11ea-99da-0ac135e8bacf,"Pathway Figure OCR, https://www.biorxiv.org/co...",Waiver-No rights reserved (CC0),pathway,Interferon,Interferon,bioentities_symbol,IFNA2,3440
3,PMC6134364__GE-4-357-g005.jpg,PMC6134364,GE-4-357-g005.jpg,1995,0.882552,115615,/pmc/articles/PMC6134364/figure/fig5/,../data/images/PMC6134364__GE-4-357-g005.jpg,,42272,...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,a6088798-df17-11ea-99da-0ac135e8bacf,"Pathway Figure OCR, https://www.biorxiv.org/co...",Waiver-No rights reserved (CC0),pathway,Interferon,Interferon,bioentities_symbol,IFNA4,3441
4,PMC6134364__GE-4-357-g005.jpg,PMC6134364,GE-4-357-g005.jpg,1995,0.882552,115615,/pmc/articles/PMC6134364/figure/fig5/,../data/images/PMC6134364__GE-4-357-g005.jpg,,42272,...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,a6088798-df17-11ea-99da-0ac135e8bacf,"Pathway Figure OCR, https://www.biorxiv.org/co...",Waiver-No rights reserved (CC0),pathway,Interferon,Interferon,bioentities_symbol,IFNA5,3442


In [23]:
# List every column of the merged table (the frame display above elides some).
list(pfocr_df.columns)

['pfocr_id',
 'pmcid',
 'filename',
 'publication_year',
 'pathway_score',
 'pmc_ranked_result_index',
 'figlink',
 'source_f',
 'type.man',
 'automl_index',
 'reftext',
 'paper_title',
 'figure_title',
 'figure_number',
 'caption',
 'organism',
 'paper_link',
 'figure_link',
 'network_id',
 'methods',
 'rights',
 'networkType',
 'word',
 'symbol',
 'source',
 'hgnc_symbol',
 'entrez']

### Compare sets: PFOCR networks already on NDEx vs. analysis set

In [24]:
# Figures that currently have a network on NDEx.
pfocr_on_ndex_df = pfocr_figures_df[pfocr_figures_df["network_id"].notnull()]
pfocr_ids_on_ndex = set(pfocr_on_ndex_df["pfocr_id"])

# In order: analysis set size, figures on NDEx, overlap,
# on NDEx but not in the analysis set, in the analysis set but not on NDEx.
for set_size in (
    len(analysis_set_figure_ids),
    len(pfocr_ids_on_ndex),
    len(pfocr_ids_on_ndex & analysis_set_figure_ids),
    len(pfocr_ids_on_ndex - analysis_set_figure_ids),
    len(analysis_set_figure_ids - pfocr_ids_on_ndex),
):
    print(set_size)

32279
32263
32263
0
16


### What predicateStrings are we using on NDEx?

In [25]:
# Survey the first 1000 figures that have a network on NDEx: for every
# predicateString found, collect each subNetworkId and dataType it occurs with.
properties_by_predicate_string = dict()
surveyed_figures_df = pfocr_figures_df[
    pfocr_figures_df["network_id"].notnull()
].head(1000)
for i, row in surveyed_figures_df.iterrows():
    network_id = row["network_id"]
    pfocr_id = row["pfocr_id"]

    network_properties = my_ndex.get_network_summary(network_id)["properties"]
    for network_property in network_properties:
        predicate_string = network_property["predicateString"]
        data_type = network_property["dataType"]
        sub_network_id = network_property["subNetworkId"]

        seen_for_predicate = properties_by_predicate_string.setdefault(
            predicate_string,
            {
                "subNetworkId": set(),
                "dataType": set(),
            },
        )
        seen_for_predicate["subNetworkId"].add(sub_network_id)
        seen_for_predicate["dataType"].add(data_type)

# pp.pprint(properties_by_predicate_string)

# Collapse each survey entry (in place) into a blank property template, and
# insist that every predicateString has one subNetworkId and one dataType.
network_properties_template = list()
network_property_data_types_by_predicate_string = dict()
for k, v in properties_by_predicate_string.items():
    v["predicateString"] = k
    v["value"] = ""

    sub_network_ids = list(v["subNetworkId"])
    if len(sub_network_ids) > 1:
        raise Exception(f"{k} has more than one subNetworkId")
    v["subNetworkId"] = sub_network_ids[0]

    data_types = list(v["dataType"])
    if len(data_types) > 1:
        raise Exception(f"{k} has more than one dataType")
    only_data_type = data_types[0]
    v["dataType"] = only_data_type
    network_property_data_types_by_predicate_string[k] = only_data_type

    network_properties_template.append(v)

pp.pprint(network_property_data_types_by_predicate_string)

{   'disease': 'string',
    'figureLink': 'string',
    'figureNumber': 'string',
    'figureTitle': 'string',
    'methods': 'string',
    'networkType': 'string',
    'organism': 'string',
    'paperLink': 'string',
    'paperTitle': 'string',
    'pfocrId': 'string',
    'pmcId': 'string',
    'publicationYear': 'string',
    'reference': 'string',
    'rights': 'string'}


### Remove networks from NDEx if they are not in the analysis set

In [26]:
def remove_network_id(network_id):
    """Forget `network_id` locally (call after deleting the network on NDEx).

    Clears the ID from the `network_id` column of `pfocr_figures_df` and
    `pfocr_df`, removes every `pfocr_id_to_network_id` entry that points at
    it, rewrites the JSON cache file, and refreshes the module-level
    `pfocr_on_ndex_df`.

    Args:
        network_id: ID of the network that no longer exists on NDEx.
    """
    # Without this declaration the assignment below would only bind a local
    # name and the notebook-level frame would silently stay stale.
    global pfocr_on_ndex_df

    pfocr_figures_df.loc[
        pfocr_figures_df["network_id"] == network_id, "network_id"
    ] = None

    # Derive the figure ID(s) from the mapping itself rather than relying on a
    # `pfocr_id` variable that happens to exist in the calling cell.
    stale_pfocr_ids = [
        mapped_pfocr_id
        for mapped_pfocr_id, mapped_network_id in pfocr_id_to_network_id.items()
        if mapped_network_id == network_id
    ]
    for stale_pfocr_id in stale_pfocr_ids:
        del pfocr_id_to_network_id[stale_pfocr_id]

    # Keep the on-disk cache in step with the in-memory mapping.
    with open(cached_pfocr_id_to_network_id_filepath, "w") as f:
        json.dump(pfocr_id_to_network_id, f)

    pfocr_on_ndex_df = pfocr_figures_df[pfocr_figures_df["network_id"].notnull()]

    pfocr_df.loc[pfocr_df["network_id"] == network_id, "network_id"] = None


# Figures that have a network on NDEx but do not belong to the analysis set.
pfocr_ids_with_network = set(
    pfocr_figures_df.loc[pfocr_figures_df["network_id"].notnull(), "pfocr_id"]
)
pfocr_ids_to_remove = pfocr_ids_with_network - analysis_set_figure_ids

for pfocr_id in pfocr_ids_to_remove:
    network_id = pfocr_id_to_network_id[pfocr_id]

    # Clear the read-only flag first, then delete the network on NDEx.
    my_ndex.set_read_only(network_id, False)
    my_ndex.delete_network(network_id)

    # Finally drop the ID from the local frames and the cached mapping.
    remove_network_id(network_id)

### Update properties of remaining networks to match `pfocr_df`

**TODO:** we still need to add networks for figures that are in the analysis set but not yet on NDEx.

In [30]:
import copy
import time
import math

# Every network property this notebook manages on NDEx, in the order it is
# written; all of them use the "string" data type.
data_type_by_predicate_string = dict.fromkeys(
    [
        "disease",
        "figureLink",
        "figureNumber",
        "figureTitle",
        "methods",
        "networkType",
        "organism",
        "paperLink",
        "paperTitle",
        "pfocrId",
        "pmcId",
        "publicationYear",
        "reference",
        "rights",
    ],
    "string",
)

# Which figures-table column supplies the value for each predicateString.
# Deliberately not mapped (left as they are on NDEx unless handled elsewhere):
#   "disease": "disease"
#   "networkType": "network_type"
#   "reference": "reference"   (TODO: reference is combo of columns)
#   "rights": "rights"
column_name_by_predicate_string = dict(
    figureLink="figure_link",
    figureNumber="figure_number",
    figureTitle="figure_title",
    methods="methods",
    organism="organism",
    paperLink="paper_link",
    paperTitle="paper_title",
    pfocrId="pfocr_id",
    pmcId="pmcid",
    publicationYear="publication_year",
)

column_names = column_name_by_predicate_string.values()

# Start each run with an empty log file.
with open("./update_pfocr.log", "w"):
    pass


def _is_missing_value(value):
    """Return True when `value` is None or a float NaN (i.e. there is nothing to send)."""
    return value is None or (isinstance(value, float) and math.isnan(value))


def _merge_network_properties(row, network_properties, predicate_string_by_column_name):
    """Build the property list to write to NDEx for one figure row.

    Existing properties are first normalised from snake_case column names to
    their lowerCamelCase predicateString. Then, for each managed
    predicateString, the row's value wins when present; otherwise the existing
    property (if any) is carried over unchanged.
    """
    existing_by_predicate_string = dict()
    for network_property in network_properties:
        predicate_string = network_property["predicateString"]

        # in case it's still using snake_case, let's go to lowerCamelCase
        predicate_string = predicate_string_by_column_name.get(
            predicate_string, predicate_string
        )
        network_property["predicateString"] = predicate_string

        if predicate_string == "reference":
            paper_title = row["paper_title"]
            paper_link = row["paper_link"]
            reftext = row["reftext"]
            network_property[
                "value"
            ] = f'{reftext} <b>{paper_title}</b> <a href="{paper_link}" target="_blank">{paper_link}</a>'

        existing_by_predicate_string[predicate_string] = network_property

    # Iterating the dict of managed predicateStrings yields each name once, so
    # the result cannot contain the same predicateString twice.
    updated_network_properties = list()
    for predicate_string, data_type in data_type_by_predicate_string.items():
        column_name = column_name_by_predicate_string.get(predicate_string)
        if column_name is not None and not _is_missing_value(row[column_name]):
            updated_network_properties.append(
                {
                    "predicateString": predicate_string,
                    "value": row[column_name],
                    "dataType": data_type,
                    "subNetworkId": None,
                }
            )
        elif predicate_string in existing_by_predicate_string:
            updated_network_properties.append(
                existing_by_predicate_string[predicate_string]
            )

    return updated_network_properties


def update_networks(
    my_ndex,
    pfocr_figures_df,
    consecutive_fails_limit=1000,
    no_progress_iterations_limit=3,
    completed=None,
    consecutive_fails=0,
    no_progress_iterations=0,
):
    """Push name, description and properties from `pfocr_figures_df` to NDEx.

    Every row with a non-null `network_id` is processed. Networks that fail
    are retried in further rounds; a round without any progress is followed by
    an exponentially growing wait (5 min, 35 min, ~4 h, ...). Progress is
    appended to ./update_pfocr.log.

    Args:
        my_ndex: authenticated NDEx client.
        pfocr_figures_df: figures table with `network_id`, `pfocr_id`,
            `organism`, `figure_title`, `caption` and the columns named in
            `column_name_by_predicate_string`.
        consecutive_fails_limit: abort after this many failures in a row.
        no_progress_iterations_limit: number of no-progress rounds tolerated.
        completed: set of network IDs already done; they are skipped and the
            set is added to in place. Defaults to a fresh empty set per call.
        consecutive_fails: starting value of the failure streak counter.
        no_progress_iterations: starting value of the no-progress counter.

    Returns:
        0 when every remaining network was updated, 1 when giving up after too
        many rounds without progress.

    Raises:
        Exception: when a row's organism is "XXX" or "REJECT", or when
            `consecutive_fails_limit` failures occur in a row.
    """
    # A `set()` default would be created once and shared by every call, so a
    # second run would start with the first run's completed IDs.
    if completed is None:
        completed = set()

    # Reverse lookup used to rename snake_case properties to lowerCamelCase.
    predicate_string_by_column_name = {
        column_name: predicate_string
        for predicate_string, column_name in column_name_by_predicate_string.items()
    }

    has_network_id = pfocr_figures_df["network_id"].notnull()
    total_network_count = int(has_network_id.sum())

    # One pass of this loop is one attempt over whatever is still outstanding.
    while True:
        msg = f"attempt {no_progress_iterations + 1} of {no_progress_iterations_limit + 1}"
        print(msg)
        with open("./update_pfocr.log", "a") as f:
            f.write(msg + "\n")

        initial_completed_count = len(completed)
        remaining_pfocr_figures_df = pfocr_figures_df[
            has_network_id & (~pfocr_figures_df["network_id"].isin(list(completed)))
        ]

        # Only visit networks that are still outstanding, so retries do not
        # redo (and re-count) networks that already succeeded.
        for _, row in remaining_pfocr_figures_df.iterrows():
            network_id = row["network_id"]
            pfocr_id = row["pfocr_id"]
            organism = row["organism"]
            if organism in {"XXX", "REJECT"}:
                raise Exception(
                    f"organism for network {network_id} / {pfocr_id} is {organism}"
                )

            # Remove items where organism is only a non-human organism?
            # e.g., PMC186276__46891-2f3_L1TT.jpg
            # OK:
            #   'XXX' or 'REJECT'
            #   contains 'Homo sapiens'

            try:
                network_properties = my_ndex.get_network_summary(network_id)[
                    "properties"
                ]
            except Exception as e:
                print(f"Can't get network_id {network_id}")
                print(f"  {type(e).__name__}: {e}")

                consecutive_fails += 1
                if consecutive_fails >= consecutive_fails_limit:
                    raise Exception(f"Failed for too many. Stopping.")
                print(f"Skipping {network_id} and moving to next.")
                continue

            updated_network_properties = _merge_network_properties(
                row, network_properties, predicate_string_by_column_name
            )

            try:
                my_ndex.set_read_only(network_id, False)
                my_ndex.update_network_profile(
                    network_id,
                    {"name": row["figure_title"], "description": row["caption"]},
                )
                my_ndex.set_network_properties(network_id, updated_network_properties)
                my_ndex.set_read_only(network_id, True)

                no_progress_iterations = 0
                consecutive_fails = 0
                completed.add(network_id)
                with open("./update_pfocr.log", "a") as f:
                    f.write(network_id + "\n")
            except Exception as e:
                print(f"Failed for network_id {network_id}")
                print(f"  {type(e).__name__}: {e}")

                consecutive_fails += 1
                if consecutive_fails >= consecutive_fails_limit:
                    raise Exception(f"Failed for too many. Stopping.")
                print(f"Skipping {network_id} and moving to next.")
                continue

        if len(set(remaining_pfocr_figures_df["network_id"]) - completed) == 0:
            return 0
        if no_progress_iterations >= no_progress_iterations_limit:
            return 1

        if len(completed) == initial_completed_count:
            # If we didn't make any progress, wait before trying again.
            # Use exponential back off:
            # 7^0 x 5 = 5 min
            # 7^1 x 5 = 35 min
            # 7^2 x 5 = ~4 hrs
            # 7^3 x 5 = ~29 hrs

            backoff_base = 7
            backoff_factor = 5  # minutes
            wait_sec = (
                math.pow(backoff_base, no_progress_iterations) * backoff_factor * 60
            )
            print(f"waiting for {int(wait_sec / 60)} min before trying again")
            time.sleep(wait_sec)

            # we want to try a limited number of times without making progress
            no_progress_iterations += 1

        print(
            f"completed {len(completed) - initial_completed_count} this round for a total of {len(completed)} out of {total_network_count}"
        )

Run it:

In [None]:
# Run the update with the default failure and retry limits.
update_networks(my_ndex, pfocr_figures_df)

attempt 1 of 4


## Pathway Disease Associations

Alex did a gene enrichment analysis on the PFOCR data to get pathway-disease associations. He provided the data [here](https://github.com/wikipathways/pathway-figure-ocr/issues/16#issuecomment-611684935).

In [26]:
# Use `dl=1` (direct download) like every other Dropbox URL in this notebook;
# `dl=0` points at the Dropbox preview page rather than the file itself.
pathway_disease_url = (
    "https://www.dropbox.com/s/vgazcxdq4wsl5yk/pfocr_disease_map.tsv?dl=1"
)

In [27]:
# Local copy of the pathway-disease map; fetched from Dropbox via `cache_from_dropbox`.
pfocr_disease_map_f = "../data/pfocr_disease_map.tsv"
cache_from_dropbox(url=pathway_disease_url, dest=pfocr_disease_map_f)

../data/pfocr_disease_map.tsv exists. Assuming already cached.


In [28]:
# Load the tab-separated disease map and rename `figid` to `pfocr_id`, the
# name used for figure IDs elsewhere in this notebook.
pfocr_disease_df = pd.read_csv(pfocr_disease_map_f, sep="\t").rename(columns={"figid": "pfocr_id"})
pfocr_disease_df

Unnamed: 0,pfocr_id,terms,doid
0,PMC100008__mb2411709009.jpg,Cancer,DOID:162
1,PMC100008__mb2411709009.jpg,Primary hyperaldosteronism,DOID:12252
2,PMC100008__mb2411709009.jpg,Lung cancer,DOID:1324
3,PMC100008__mb2411709009.jpg,Noonan syndrome,DOID:3490
4,PMC101242__gkf20707.jpg,Breast cancer,DOID:1612
...,...,...,...
72877,PMC99889__mb2110211013.jpg,Cancer,DOID:162
72878,PMC99889__mb2110211013.jpg,Cardiomyopathy,DOID:0050700
72879,PMC99889__mb2110211013.jpg,Lung cancer,DOID:1324
72880,PMC99889__mb2110211013.jpg,Noonan syndrome,DOID:3490


In [29]:
# Distinct figures that have at least one disease association.
disease_pfocr_ids = set(pfocr_disease_df["pfocr_id"])
len(disease_pfocr_ids)

23331

In [38]:
# Look up each row's NDEx network ID; figures without a network get None.
pfocr_disease_df["network_id"] = pfocr_disease_df["pfocr_id"].apply(
    pfocr_id_to_network_id.get
)
disease_row_has_network = pfocr_disease_df["network_id"].notna()
pfocr_disease_df.loc[disease_row_has_network]

Unnamed: 0,pfocr_id,terms,doid,network_id
12,PMC1052007__JCI0524159.f1.jpg,Primary hyperaldosteronism,DOID:12252,b1499264-ddd6-11ea-9101-0660b7976219
13,PMC1052008__JCI0524178.f3.jpg,Alzheimer's disease,DOID:10652,bd676e7b-de55-11ea-9101-0660b7976219
14,PMC1052008__JCI0524178.f3.jpg,Primary hyperaldosteronism,DOID:12252,bd676e7b-de55-11ea-9101-0660b7976219
15,PMC1052008__JCI0524178.f3.jpg,Ovarian cancer,DOID:2394,bd676e7b-de55-11ea-9101-0660b7976219
16,PMC1069556__zjv0080560770006.jpg,Cancer,DOID:162,536cbd18-ddd1-11ea-9101-0660b7976219
...,...,...,...,...
72846,PMC86511__mb2301061009.jpg,Aortic aneurysm,DOID:3627,48162d84-de59-11ea-9101-0660b7976219
72847,PMC86511__mb2301061009.jpg,Coloboma,DOID:12270,48162d84-de59-11ea-9101-0660b7976219
72848,PMC86511__mb2301061009.jpg,Microphthalmia,DOID:10629,48162d84-de59-11ea-9101-0660b7976219
72849,PMC86592__mb0211421009.jpg,Rheumatoid arthritis,DOID:7148,4a5e0e91-ddd2-11ea-9101-0660b7976219


In [39]:
# Write one "disease" property per network: a comma-separated list of links,
# one for every disease term associated with the figure.
for pfocr_id, figure_diseases_df in pfocr_disease_df[
    pfocr_disease_df["network_id"].notnull()
].groupby(by="pfocr_id"):
    network_id = figure_diseases_df["network_id"].iloc[0]

    # Here's a sample of a disease property value from a WikiPathways network:
    # <a href="https://identifiers.org/doid/DOID:332">ALS</a>, <a href="https://identifiers.org/doid/DOID:332">amyotrophic lateral sclerosis</a>
    disease_value = ", ".join(
        f'<a href="https://identifiers.org/doid/{doid}">{term}</a>'
        for term, doid in zip(figure_diseases_df["terms"], figure_diseases_df["doid"])
    )

    # Fetch and prepare everything before unlocking, so the network is
    # writable for as short a time as possible.
    network_properties = my_ndex.get_network_summary(network_id)["properties"]

    disease_property = next(
        (x for x in network_properties if x["predicateString"] == "disease"), None
    )
    if disease_property is not None:
        disease_property["value"] = disease_value
    else:
        network_properties.append(
            {
                # None, matching the properties written by `update_networks`;
                # mixing "" and None for one predicateString would trip the
                # "more than one subNetworkId" check in the survey cell.
                "subNetworkId": None,
                "predicateString": "disease",
                "dataType": "string",
                "value": disease_value,
            }
        )

    # TODO: are either of these calls fine?
    my_ndex.set_read_only(network_id, False)
    # my_ndex.set_network_system_properties(network_id, {"readOnly": False})
    try:
        my_ndex.set_network_properties(network_id, network_properties)
    finally:
        # Always restore the read-only flag, even when the write fails or the
        # cell is interrupted part-way through.
        # my_ndex.set_network_system_properties(network_id, {"readOnly": True})
        my_ndex.set_read_only(network_id, True)

KeyboardInterrupt: 