In [1]:
%load_ext lab_black

Look at using this for jupyterlab-connect on macOS:
```
caffeinate -i -s /usr/bin/ssh nixos
```

or maybe:
```
caffeinate -i -s /usr/local/bin/mosh nixos
```

or maybe:
```
caffeinate -i -s /Users/andersriutta/jupyterlab-connect/bin/jupyterlab-connect nixos:Documents/pfocr2ndex
```

In [2]:
import json
import ndex2.client
import os
import numpy as np
import pandas as pd
from pathlib import Path
import pprint

from caching import cache_from_dropbox

In [3]:
pp = pprint.PrettyPrinter(indent=4)

In [4]:
import getpass

# Credentials come from the environment when available; otherwise we prompt
# for them so that they never need to be hard-coded in the notebook.
ndex_user = os.environ.get("NDEX_USER")
ndex_pwd = os.environ.get("NDEX_PWD")

if not ndex_user:
    print("Please enter your NDEx username:")
    # strip so that stray whitespace doesn't silently select the test server
    ndex_user = input().strip()

if not ndex_pwd:
    print("Please enter your NDEx password:")
    ndex_pwd = getpass.getpass()

# Maps a human-readable network set name to its NDEx UUID on the selected server.
networkset_ids_by_name = dict()

# NOTE(review): both server URLs use plain http:// -- consider https:// if the
# NDEx servers support it, since these credentials are used to log in. Confirm
# before changing.
if ndex_user == "wikipathways":
    # for prod
    ndex_url = "http://www.ndexbio.org"
    networkset_ids_by_name[
        "Published Pathway Figures - Analysis Set"
    ] = "85034b42-de8a-11ea-99da-0ac135e8bacf"
    networkset_ids_by_name[
        "WikiPathways Collection - Homo sapiens"
    ] = "453c1c63-5c10-11e9-9f06-0ac135e8bacf"
    networkset_ids_by_name[
        "CPTAC Cancer Hallmark Networks"
    ] = "9541cc61-4cf0-11e9-9f06-0ac135e8bacf"
elif ndex_user:
    # for test/dev
    ndex_url = "http://test.ndexbio.org"
    networkset_ids_by_name[
        "Published Pathway Figures - Analysis Set"
    ] = "8970df33-d6bd-11ea-9101-0660b7976219"
    networkset_ids_by_name[
        "wikipathways-gpml-Homo_sapiens"
    ] = "b44b7ca7-4da1-11e9-9fc6-0660b7976219"
else:
    # Without a username we cannot pick a server, and ndex_url would be
    # undefined below. Fail here with a clear message instead of a NameError.
    raise ValueError(
        "An NDEx username is required: set NDEX_USER or enter it when prompted."
    )

print(f"Running as {ndex_user} on {ndex_url}")

Please enter your NDEx username:


 wikipathways


Please enter your NDEx password:


 ······


Running as wikipathways on http://www.ndexbio.org


## NDEx

Let's get the current user's info from NDEx to check that we can connect correctly:

In [5]:
anon_ndex = ndex2.client.Ndex2(ndex_url)

In [6]:
anon_ndex.get_user_by_username(ndex_user)

{'properties': {},
 'isIndividual': True,
 'userName': 'wikipathways',
 'isVerified': True,
 'firstName': 'WikiPathways',
 'lastName': 'Project',
 'image': 'https://upload.wikimedia.org/wikipedia/commons/8/83/Wplogo_with_text_500.png',
 'website': 'https://www.wikipathways.org',
 'description': '<p></p><h6>Welcome to the NDEx repository for <a href="https://wikipathways.org" target="">WikiPathways</a>. Here we provide network versions of our curated and approved pathway content. <br/>All our content is freely available under the CC0 waiver. Feel free to clone, download, reuse, adapt and publish. Here\'s <a href="https://www.wikipathways.org/index.php/How_to_cite_WikiPathways" target="">how to cite us</a>. <br/></h6><h6><b>For WikiPathways Authors</b>: to add new WikiPathways models or edit the original existing ones, please visit <a href="https://wikipathways.org">our website.</a> <br/></h6>',
 'externalId': '363f49e0-4cf0-11e9-9f06-0ac135e8bacf',
 'isDeleted': False,
 'modificationTim

Now we'll log in as the current user:

In [7]:
my_ndex = ndex2.client.Ndex2(ndex_url, ndex_user, ndex_pwd)

And we'll get the info for the PFOCR network set:

In [8]:
pfocr_networkset_id = networkset_ids_by_name["Published Pathway Figures - Analysis Set"]

pfocr_networkset_data = my_ndex.get_networkset(pfocr_networkset_id)
pfocr_networkset_data.keys()

dict_keys(['name', 'description', 'ownerId', 'networks', 'showcased', 'properties', 'externalId', 'isDeleted', 'modificationTime', 'creationTime'])

In [9]:
len(pfocr_networkset_data["networks"])

32263

In [10]:
sample_network = my_ndex.get_network_summary(pfocr_networkset_data["networks"][0])
sample_network.keys()



I think we can't search for a network by `pfocr_id` (`figid`) because we didn't specify that these networks should be indexed:
```
Visibility: Public (not searchable)
```

Was this just for test.ndexbio.org or for production too? I think we only index the network set, not the individual networks contained.

How should we find `pfocr_id` by `network_id`? What does `include_groups` mean?

In [11]:
my_ndex.search_networks(
    search_string="PMC5868458__fphys-09-00170-g0003.jpg",
    account_name=ndex_user,
    start=0,
    size=100,
    include_groups=True,
)

{'numFound': 0, 'start': 0, 'networks': []}

## Map `pfocr_id` to `network_id`

Let's get the mappings from `pfocr_id` (`figid`) to `network_id`:

TODO: is there a better endpoint to call for this now? Maybe look into the following:
>Get Network Summaries By UUIDs
>
>/batch/network/summary?accesskey={accessKey}

https://home.ndexbio.org/using-the-ndex-server-api/

In [12]:
# Build (or load from a local JSON cache) the mapping pfocr_id -> NDEx network_id
# for every network in the PFOCR network set. Building it costs one NDEx request
# per network, so the result is cached per NDEx user.
cached_pfocr_id_to_network_id_filepath = (
    f"../data/{ndex_user}_pfocr_id_to_network_id.json"
)

pfocr_id_to_network_id = dict()

cached_pfocr_id_to_network_id_f = (
    Path(cached_pfocr_id_to_network_id_filepath).expanduser().resolve()
)

if cached_pfocr_id_to_network_id_f.exists():
    with open(cached_pfocr_id_to_network_id_f, "r") as f:
        pfocr_id_to_network_id = json.load(f)
else:
    for network_id in pfocr_networkset_data["networks"]:
        network_summary = my_ndex.get_network_summary(network_id)
        property_value_by_predicate_string = {
            p["predicateString"]: p["value"] for p in network_summary["properties"]
        }

        if "pfocrId" in property_value_by_predicate_string:
            # Current naming: this is the predicate the update step of this
            # notebook writes. Those networks are renamed to their figure title,
            # so the network name is NOT expected to equal the pfocr_id here.
            pfocr_id = property_value_by_predicate_string["pfocrId"]
        elif "pfocr_id" in property_value_by_predicate_string:
            # Legacy naming: the network name is expected to be the pfocr_id.
            pfocr_id = property_value_by_predicate_string["pfocr_id"]
            if pfocr_id != network_summary["name"]:
                raise Exception(
                    f"Expected pfocr_id {pfocr_id} to equal network_summary['name'] {network_summary['name']}"
                )
        else:
            # Explicit error instead of a bare StopIteration from next().
            raise Exception(
                f"No pfocrId or pfocr_id property found for network_id {network_id}"
            )

        if pfocr_id in pfocr_id_to_network_id:
            raise Exception(
                f"pfocr_id_to_network_id[{pfocr_id}] already set: {pfocr_id_to_network_id[pfocr_id]}"
            )
        pfocr_id_to_network_id[pfocr_id] = network_id

    # Make sure the cache directory exists so that the (slow) work above is not
    # lost to a FileNotFoundError when writing.
    cached_pfocr_id_to_network_id_f.parent.mkdir(parents=True, exist_ok=True)
    with open(cached_pfocr_id_to_network_id_f, "w") as f:
        json.dump(pfocr_id_to_network_id, f)

In [13]:
list(pfocr_id_to_network_id.items())[0]

('PMC6372626__41467_2019_8576_Fig6_HTML.jpg',
 '0626a9a9-df0c-11ea-99da-0ac135e8bacf')

## Load PFOCR Data

In [14]:
import pandas as pd
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri

from rpy2.robjects.conversion import localconverter

pandas2ri.activate()
base = importr("base")
readRDS = ro.r["readRDS"]



### figures

In [15]:
# Download (once) and read the list of figure ids that make up the analysis set.
# The file is read as one figure id per line.
analysis_set_figure_ids_url = (
    "https://www.dropbox.com/s/r7kc2hwzou3r2gm/analysis_set_figure_ids.tsv?dl=1"
)
analysis_set_figure_ids_f = Path(
    "~/Dropbox (Gladstone)/Documents/PFOCR_25Years/analysis_set_figure_ids.tsv"
).expanduser()
cache_from_dropbox(url=analysis_set_figure_ids_url, dest=analysis_set_figure_ids_f)

analysis_set_figure_ids = set()
with open(analysis_set_figure_ids_f, "r") as f:
    # every line of the file becomes one member of the set
    analysis_set_figure_ids.update(f.read().splitlines())

len(analysis_set_figure_ids)

/home/ariutta/Dropbox (Gladstone)/Documents/PFOCR_25Years/analysis_set_figure_ids.tsv exists. Assuming already cached.


32279

In [16]:
# Download (once) the PFOCR figure metadata, an R .rds file, and read it via R.
pfocr_figures_rds_url = (
    "https://www.dropbox.com/s/n5j6vrd1v93ve05/pfocr_figures.rds?dl=1"
)
pfocr_figures_f = "~/Dropbox (Gladstone)/Documents/PFOCR_25Years/pfocr_figures.rds"
cache_from_dropbox(url=pfocr_figures_rds_url, dest=pfocr_figures_f)
pfocr_figures_rdf = readRDS(pfocr_figures_f)

# Convert the R data frame to pandas; only the conversion needs the converter.
with localconverter(ro.default_converter + pandas2ri.converter):
    pfocr_figures_converted_df = ro.conversion.rpy2py(pfocr_figures_rdf)

# Use this notebook's column naming and a stable row order.
pfocr_figures_renames = {
    "figid": "pfocr_id",
    "year": "publication_year",
    "number": "figure_number",
    "figtitle": "figure_title",
    "papertitle": "paper_title",
}
pfocr_figures_df = pfocr_figures_converted_df.rename(
    columns=pfocr_figures_renames
).sort_values(["publication_year", "pmcid", "pfocr_id"])

# Derived URL columns:
#   paper_link:           the PMC article page
#   figure_page_url:      the HTML page for the figure (built from figlink)
#   figure_thumbnail_url: the figure image file itself
pmc_article_url = (
    "https://www.ncbi.nlm.nih.gov/pmc/articles/" + pfocr_figures_df["pmcid"]
)
pfocr_figures_df["paper_link"] = pmc_article_url
pfocr_figures_df["figure_page_url"] = (
    "https://www.ncbi.nlm.nih.gov" + pfocr_figures_df["figlink"]
)
pfocr_figures_df["figure_thumbnail_url"] = (
    pmc_article_url + "/bin/" + pfocr_figures_df["filename"]
)

pfocr_figures_df.head(2)

~/Dropbox (Gladstone)/Documents/PFOCR_25Years/pfocr_figures.rds exists. Assuming already cached.


Unnamed: 0,pfocr_id,pmcid,filename,publication_year,pathway_score,pmc_ranked_result_index,figlink,source_f,type.man,automl_index,reftext,paper_title,figure_title,figure_number,caption,organism,paper_link,figure_page_url,figure_thumbnail_url
52325,PMC6134364__GE-4-357-g005.jpg,PMC6134364,GE-4-357-g005.jpg,1995,0.882552,115615,/pmc/articles/PMC6134364/figure/fig5/,../data/images/PMC6134364__GE-4-357-g005.jpg,,42272,Joe B. Harford. Gene Expr. 1995;4(6):357-367.,Translation-Targeted Therapeutics for Viral Di...,PKR pathway,FIG. 5,The PKR pathway. Double-stranded RNA (dsRNA) r...,Homo sapiens,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...
32576,PMC6138031__GE-5-01-g002.jpg,PMC6138031,GE-5-01-g002.jpg,1995,0.953649,120261,/pmc/articles/PMC6138031/figure/fig2/,../data/images/PMC6138031__GE-5-01-g002.jpg,,12632,"Kimberly C. Gilmour, et al. Gene Expr. 1995;5(...",Signal Transduction and Activation of Gene Tra...,Illustrative model of the IFN signal transduct...,FIG. 2,Illustrative model of the IFN signal transduct...,Homo sapiens,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...


In [17]:
print(len(analysis_set_figure_ids))
print(len(set(pfocr_figures_df["pfocr_id"]) - analysis_set_figure_ids))
print(len(analysis_set_figure_ids - set(pfocr_figures_df["pfocr_id"])))

32279
32364
0


Add column `network_id` (for NDEx):

In [18]:
# Look up each figure's NDEx network id; dict.get yields None for figures that
# have no network in the mapping.
pfocr_figures_df["network_id"] = pfocr_figures_df["pfocr_id"].apply(
    pfocr_id_to_network_id.get
)
has_network_id = pfocr_figures_df["network_id"].notnull()
pfocr_figures_df[has_network_id].head(2)

Unnamed: 0,pfocr_id,pmcid,filename,publication_year,pathway_score,pmc_ranked_result_index,figlink,source_f,type.man,automl_index,reftext,paper_title,figure_title,figure_number,caption,organism,paper_link,figure_page_url,figure_thumbnail_url,network_id
52325,PMC6134364__GE-4-357-g005.jpg,PMC6134364,GE-4-357-g005.jpg,1995,0.882552,115615,/pmc/articles/PMC6134364/figure/fig5/,../data/images/PMC6134364__GE-4-357-g005.jpg,,42272,Joe B. Harford. Gene Expr. 1995;4(6):357-367.,Translation-Targeted Therapeutics for Viral Di...,PKR pathway,FIG. 5,The PKR pathway. Double-stranded RNA (dsRNA) r...,Homo sapiens,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,a6088798-df17-11ea-99da-0ac135e8bacf
32576,PMC6138031__GE-5-01-g002.jpg,PMC6138031,GE-5-01-g002.jpg,1995,0.953649,120261,/pmc/articles/PMC6138031/figure/fig2/,../data/images/PMC6138031__GE-5-01-g002.jpg,,12632,"Kimberly C. Gilmour, et al. Gene Expr. 1995;5(...",Signal Transduction and Activation of Gene Tra...,Illustrative model of the IFN signal transduct...,FIG. 2,Illustrative model of the IFN signal transduct...,Homo sapiens,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,994fae28-deee-11ea-99da-0ac135e8bacf


Let's add some columns we know we'll use:

In [19]:
pfocr_figures_df[
    "methods"
] = "Pathway Figure OCR, https://www.biorxiv.org/content/10.1101/2020.05.29.124503v1.full "

pfocr_figures_df["rights"] = "Waiver-No rights reserved (CC0)"

pfocr_figures_df["network_type"] = "pathway"

## the following takes too much memory
# pfocr_df["reference"] = (
#    pfocr_df["reftext"].astype("str")
#    + "<b>"
#    + pfocr_df["paper_title"].astype("str")
#    + '</b> <a href="'
#    + pfocr_df["paper_link"].astype("str")
#    + 'target="_blank">'
#    + pfocr_df["paper_link"].astype("str")
#    + "</a>"
# )

### Pathway Disease Associations

Alex did a gene enrichment analysis on the PFOCR data to get pathway-disease associations. He provided the data [here](https://github.com/wikipathways/pathway-figure-ocr/issues/16#issuecomment-611684935).

In [20]:
pathway_disease_url = (
    "https://www.dropbox.com/s/vgazcxdq4wsl5yk/pfocr_disease_map.tsv?dl=0"
)

In [21]:
pfocr_disease_map_f = "../data/pfocr_disease_map.tsv"
cache_from_dropbox(url=pathway_disease_url, dest=pfocr_disease_map_f)

../data/pfocr_disease_map.tsv exists. Assuming already cached.


In [22]:
pfocr_disease_df = pd.read_csv(pfocr_disease_map_f, sep="\t").rename(
    columns={"figid": "pfocr_id"}
)
pfocr_disease_df

Unnamed: 0,pfocr_id,terms,doid
0,PMC100008__mb2411709009.jpg,Cancer,DOID:162
1,PMC100008__mb2411709009.jpg,Primary hyperaldosteronism,DOID:12252
2,PMC100008__mb2411709009.jpg,Lung cancer,DOID:1324
3,PMC100008__mb2411709009.jpg,Noonan syndrome,DOID:3490
4,PMC101242__gkf20707.jpg,Breast cancer,DOID:1612
...,...,...,...
72877,PMC99889__mb2110211013.jpg,Cancer,DOID:162
72878,PMC99889__mb2110211013.jpg,Cardiomyopathy,DOID:0050700
72879,PMC99889__mb2110211013.jpg,Lung cancer,DOID:1324
72880,PMC99889__mb2110211013.jpg,Noonan syndrome,DOID:3490


In [23]:
disease_pfocr_ids = set(pfocr_disease_df["pfocr_id"])
len(disease_pfocr_ids)

23331

### Genes

### Merge

### Compare sets: PFOCR networks already on NDEx vs. analysis set

In [24]:
pfocr_on_ndex_df = pfocr_figures_df[pfocr_figures_df["network_id"].notnull()]
print(len(analysis_set_figure_ids))
print(len(set(pfocr_on_ndex_df["pfocr_id"])))
print(len(set(pfocr_on_ndex_df["pfocr_id"]).intersection(analysis_set_figure_ids)))
print(len(set(pfocr_on_ndex_df["pfocr_id"]) - analysis_set_figure_ids))
print(len(analysis_set_figure_ids - set(pfocr_on_ndex_df["pfocr_id"])))

32279
32263
32263
0
16


### What predicateStrings are we using on NDEx?

In [25]:
# Survey the properties of (up to) the first 1000 networks already on NDEx and
# collect, per predicateString, every subNetworkId and dataType seen.
properties_by_predicate_string = dict()
sampled_on_ndex_df = pfocr_figures_df[
    pfocr_figures_df["network_id"].notnull()
].head(1000)
for i, row in sampled_on_ndex_df.iterrows():
    network_id = row["network_id"]
    pfocr_id = row["pfocr_id"]

    network_properties = my_ndex.get_network_summary(network_id)["properties"]
    for network_property in network_properties:
        predicate_string = network_property["predicateString"]
        data_type = network_property["dataType"]
        sub_network_id = network_property["subNetworkId"]
        seen = properties_by_predicate_string.setdefault(
            predicate_string,
            {
                "subNetworkId": set(),
                "dataType": set(),
            },
        )
        seen["subNetworkId"].add(sub_network_id)
        seen["dataType"].add(data_type)

# Collapse each entry in place into a single property template, insisting that
# every predicateString was seen with exactly one subNetworkId and one dataType.
network_properties_template = list()
network_property_data_types_by_predicate_string = dict()
for predicate, template in properties_by_predicate_string.items():
    template["predicateString"] = predicate
    template["value"] = ""

    sub_network_ids = list(template["subNetworkId"])
    if len(sub_network_ids) > 1:
        raise Exception(f"{predicate} has more than one subNetworkId")
    template["subNetworkId"] = sub_network_ids[0]

    data_types = list(template["dataType"])
    if len(data_types) > 1:
        raise Exception(f"{predicate} has more than one dataType")
    template["dataType"] = data_types[0]

    network_property_data_types_by_predicate_string[predicate] = data_types[0]
    network_properties_template.append(template)

pp.pprint(network_property_data_types_by_predicate_string)

{   'figureLink': 'string',
    'figureNumber': 'string',
    'figureTitle': 'string',
    'methods': 'string',
    'networkType': 'string',
    'organism': 'string',
    'paperLink': 'string',
    'paperTitle': 'string',
    'pfocrId': 'string',
    'pmcId': 'string',
    'publicationYear': 'string',
    'reference': 'string',
    'rights': 'string'}


### Remove networks from PFOCR networkset on NDEx if they are not in the analysis set

In [26]:
def remove_network_id(network_id):
    """Forget a network that has been deleted from NDEx.

    Clears ``network_id`` from ``pfocr_figures_df``, removes the matching
    entry (or entries) from ``pfocr_id_to_network_id``, rewrites the JSON
    cache of that mapping, and refreshes ``pfocr_on_ndex_df``.

    Args:
        network_id: UUID of the NDEx network that no longer exists.
    """
    # Without this declaration the assignment below would only create a local
    # variable and the notebook-level frame would stay stale.
    global pfocr_on_ndex_df

    pfocr_figures_df.loc[
        pfocr_figures_df["network_id"] == network_id, "network_id"
    ] = None

    # Find the cache entries from the network_id we were given, rather than
    # relying on whatever a global `pfocr_id` happens to hold in the kernel.
    stale_pfocr_ids = [
        cached_pfocr_id
        for cached_pfocr_id, cached_network_id in pfocr_id_to_network_id.items()
        if cached_network_id == network_id
    ]
    for stale_pfocr_id in stale_pfocr_ids:
        del pfocr_id_to_network_id[stale_pfocr_id]
    with open(cached_pfocr_id_to_network_id_filepath, "w") as f:
        json.dump(pfocr_id_to_network_id, f)

    pfocr_on_ndex_df = pfocr_figures_df[pfocr_figures_df["network_id"].notnull()]

    # `pfocr_df` is not defined anywhere in the visible notebook. Update it
    # only if it exists in the kernel, instead of raising NameError.
    pfocr_df = globals().get("pfocr_df")
    if pfocr_df is not None:
        pfocr_df.loc[pfocr_df["network_id"] == network_id, "network_id"] = None


# DESTRUCTIVE: for every figure that has a network on NDEx but is not in the
# analysis set, delete that network from NDEx and then drop it from our local
# bookkeeping. The set of pfocr_ids is computed once, before the loop starts,
# so mutating pfocr_figures_df inside the loop does not affect the iteration.
for pfocr_id in (
    set(pfocr_figures_df[pfocr_figures_df["network_id"].notnull()]["pfocr_id"])
    - analysis_set_figure_ids
):
    network_id = pfocr_id_to_network_id[pfocr_id]

    # Presumably a read-only network cannot be deleted, hence clearing the
    # flag first -- TODO confirm against the NDEx client documentation.
    my_ndex.set_read_only(network_id, False)
    my_ndex.delete_network(network_id)

    remove_network_id(network_id)

### Update properties of remaining networks to match `pfocr_figures_df`

TODO: we still need to add the networks from the analysis set that are not yet on NDEx.

In [29]:
import time
import math

data_type_by_predicate_string = {
    "disease": "string",
    "figureLink": "string",
    "figureNumber": "string",
    "figureTitle": "string",
    "methods": "string",
    "networkType": "string",
    "organism": "string",
    "paperLink": "string",
    "paperTitle": "string",
    "pfocrId": "string",
    "pmcId": "string",
    "publicationYear": "string",
    "reference": "string",
    "rights": "string",
}

# disease and reference don't come directly from a
# pfocr_figures_df column, so we don't include them
# here. We need to use custom logic for them.
column_name_by_predicate_string = {
    # note the difference:
    #     figure_thumbnail_url: URL to the actual JPG of the thumbnail
    #     figure_page_url: URL to the HTML page for the figure
    "figureLink": "figure_page_url",
    "figureNumber": "figure_number",
    "figureTitle": "figure_title",
    "methods": "methods",
    "networkType": "network_type",
    "organism": "organism",
    "paperLink": "paper_link",
    "paperTitle": "paper_title",
    "pfocrId": "pfocr_id",
    "pmcId": "pmcid",
    "publicationYear": "publication_year",
    "rights": "rights",
}

column_names = column_name_by_predicate_string.values()

log_file_path = "./update_pfocr.log"

def update_networks(
    my_ndex,
    remaining_pfocr_figures_df,
    consecutive_fails_count_limit=10,
    no_progress_iterations_count_limit=3,
    completed=None,
    no_progress_iterations_count=0,
    iteration=0,
):
    """Update the profile and properties of NDEx networks from figure rows.

    For every row that has a ``network_id``, this sets the network's name,
    description and properties on NDEx, then marks the network read-only.
    Networks that fail are skipped and retried in a later round (the function
    calls itself with the not-yet-completed rows). Rounds that make no
    progress are separated by an exponentially growing wait.

    Progress and failures are printed and appended to ``log_file_path``.

    Args:
        my_ndex: authenticated NDEx client.
        remaining_pfocr_figures_df: figure rows still to be processed. Rows
            whose ``network_id`` is null are ignored.
        consecutive_fails_count_limit: abort (raise) after this many
            consecutive failures within one round.
        no_progress_iterations_count_limit: give up after this many
            consecutive rounds without a single success.
        completed: set of network ids already updated. Defaults to a fresh
            empty set for every top-level call.
        no_progress_iterations_count: consecutive rounds so far without
            progress (used by the retry recursion).
        iteration: round number; the log file is overwritten when it is 0 and
            appended to otherwise.

    Returns:
        0 when every network was updated, 1 when we gave up after too many
        rounds without progress.

    Raises:
        Exception: after too many consecutive failures in one round, or when a
            column is missing from a row although the matching property was
            previously set on NDEx.
    """
    # A `set()` default would be created once and shared by every call, so a
    # second top-level call would wrongly believe everything is already done.
    if completed is None:
        completed = set()

    def append_to_log(*lines):
        """Append each item (converted to text) as one line of the log file."""
        with open(log_file_path, "a") as f:
            for line in lines:
                f.write(f"{line}\n")

    # Rows without a network_id have nothing on NDEx to update. Leaving them in
    # would produce guaranteed failures and keep the "all done" check below
    # from ever succeeding.
    remaining_pfocr_figures_df = remaining_pfocr_figures_df[
        remaining_pfocr_figures_df["network_id"].notnull()
    ]

    msg = f"attempt {no_progress_iterations_count + 1} of {no_progress_iterations_count_limit + 1}"
    print(msg)

    # overwrite the first time
    with open(log_file_path, "w" if (iteration == 0) else "a") as f:
        f.write(msg + "\n")

    consecutive_fails_count = 0
    initial_completed_count = len(completed)
    for _, pfocr_figures_row in remaining_pfocr_figures_df.iterrows():
        network_id = pfocr_figures_row["network_id"]
        pfocr_id = pfocr_figures_row["pfocr_id"]

        try:
            existing_network_properties = my_ndex.get_network_summary(network_id)[
                "properties"
            ]
        except Exception as e:
            # The exception is formatted as text: concatenating it with a str
            # would raise TypeError and mask the real error.
            append_to_log(f"Can't get network_id {network_id}", e)
            print(f"Can't get network_id {network_id}")
            print(e)

            consecutive_fails_count += 1
            if consecutive_fails_count >= consecutive_fails_count_limit:
                raise Exception("Failed for too many. Stopping.") from e

            print(f"Skipping {network_id} and moving to next.")
            append_to_log(f"Skipping {network_id} and moving to next.")
            continue

        ###################
        # "auto" properties
        # -----------------
        # "Automatically" translated from dataframe column to network property via column_name_by_predicate_string.
        # Data type comes from data_type_by_predicate_string.
        ###################
        updated_network_properties = list()

        for predicate_string, column_name in column_name_by_predicate_string.items():
            data_type = data_type_by_predicate_string[predicate_string]

            # NOTE(review): only None counts as missing here; a NaN value would
            # be sent to NDEx as-is -- confirm whether NaN can occur.
            if pfocr_figures_row[column_name] is None:
                print(
                    f"missing_column_name: {column_name} not found in pfocr_figures_row for {pfocr_id} / {network_id}"
                )
                missing_property = next(
                    (
                        x
                        for x in existing_network_properties
                        if x["predicateString"] == predicate_string
                    ),
                    None,
                )
                if missing_property:
                    # Dropping a property that NDEx already has would lose
                    # data, so stop rather than silently overwrite.
                    raise Exception(
                        f"{column_name} not found in pfocr_figures_row, BUT {predicate_string} was previously set on NDEx for {pfocr_id} / {network_id}"
                    )
            else:
                updated_network_properties.append(
                    {
                        "predicateString": predicate_string,
                        "value": pfocr_figures_row[column_name],
                        "dataType": data_type,
                        "subNetworkId": None,
                    }
                )

        ############
        # reference
        ############
        paper_title = pfocr_figures_row["paper_title"]
        paper_link = pfocr_figures_row["paper_link"]
        reftext = pfocr_figures_row["reftext"]
        updated_network_properties.append(
            {
                "predicateString": "reference",
                "value": f'{reftext} <b>{paper_title}</b> <a href="{paper_link}" target="_blank">{paper_link}</a>',
                "dataType": data_type_by_predicate_string["reference"],
                "subNetworkId": None,
            }
        )

        ##########
        # disease
        ##########
        # Here's an example of what a disease property value should look like:
        # '<a href="https://identifiers.org/doid/DOID:332">ALS</a>, <a href="https://identifiers.org/doid/DOID:332">amyotrophic lateral sclerosis</a>'

        disease_links = list()
        for _, disease_row in pfocr_disease_df[
            pfocr_disease_df["pfocr_id"] == pfocr_id
        ][["terms", "doid"]].iterrows():
            term = disease_row["terms"]
            doid = disease_row["doid"]
            disease_links.append(
                f'<a href="https://identifiers.org/doid/{doid}">{term}</a>'
            )
        if len(disease_links) > 0:
            updated_network_properties.append(
                {
                    "predicateString": "disease",
                    "dataType": "string",
                    "value": ", ".join(disease_links),
                    "subNetworkId": None,
                }
            )

        try:
            my_ndex.set_read_only(network_id, False)

            # network profile
            caption = pfocr_figures_row["caption"]
            figure_thumbnail_url = pfocr_figures_row["figure_thumbnail_url"]
            description = f"""
<p>{caption}</p>
<p><img src="{figure_thumbnail_url}" style="width: 100%;"></p>
            """
            my_ndex.update_network_profile(
                network_id,
                {"name": pfocr_figures_row["figure_title"], "description": description},
            )

            # network properties
            my_ndex.set_network_properties(network_id, updated_network_properties)

            my_ndex.set_read_only(network_id, True)

            no_progress_iterations_count = 0
            consecutive_fails_count = 0
            completed.add(network_id)
            append_to_log(f"success: {network_id}")
        except Exception as e:
            print(f"Failed for network_id {network_id}")
            print(e)
            append_to_log(f"Failed for network_id {network_id}.", e)

            # Let's keep track of the count if we fail for multiple consecutive networks
            # within a single try (or retry) cycle.
            consecutive_fails_count += 1
            if consecutive_fails_count >= consecutive_fails_count_limit:
                print(f"Failed for {consecutive_fails_count} consecutive networks. Stopping.")
                raise

            print(f"Skipping {network_id} and moving to next.")
            append_to_log(f"Skipping {network_id} and moving to next.")
            continue

    if len(set(remaining_pfocr_figures_df["network_id"]) - completed) == 0:
        print("All networks now updated.")
        return 0
    elif no_progress_iterations_count >= no_progress_iterations_count_limit:
        print(f"""
            We have tried/retried {no_progress_iterations_count} consecutive times without making any progress.
            We need to stop and take a look at why.
        """)
        return 1
    else:
        if len(completed) == initial_completed_count:
            # If we didn't make any progress in this try (or retry) cycle, wait before trying again.
            # Use exponential back off:
            # 7^0 x 5 = 5 min
            # 7^1 x 5 = 35 min
            # 7^2 x 5 = ~4 hrs
            # 7^3 x 5 = ~29 hrs

            backoff_base = 7
            initial_wait = 5  # minutes
            wait_sec = (
                math.pow(backoff_base, no_progress_iterations_count) * initial_wait * 60
            )
            print(f"waiting for {int(wait_sec / 60)} min before trying again")
            append_to_log(f"waiting for {int(wait_sec / 60)} min before trying again.")
            time.sleep(wait_sec)

            # Let's keep track of how many times in a row our retries fail to make any progress.
            no_progress_iterations_count += 1

        # Retry only what the caller asked for: narrow the frame we were given
        # instead of re-reading the notebook-level pfocr_figures_df.
        remaining_pfocr_figures_df = remaining_pfocr_figures_df[
            ~remaining_pfocr_figures_df["network_id"].isin(completed)
        ]
        print(f'''
            Updated {len(completed) - initial_completed_count} networks this round for a total of {len(completed)} from all rounds,
            leaving {len(remaining_pfocr_figures_df)} networks remaining to be updated.
        ''')
        append_to_log(
            f"Networks updated: {len(completed) - initial_completed_count} this round, {len(completed)} total, {len(remaining_pfocr_figures_df)} remaining."
        )

        return update_networks(
            my_ndex,
            remaining_pfocr_figures_df,
            consecutive_fails_count_limit,
            no_progress_iterations_count_limit,
            completed,
            no_progress_iterations_count,
            iteration + 1,
        )

#### Run it:

In [30]:
update_networks(my_ndex, pfocr_figures_df)

attempt 1 of 4
missing_column_name: publication_year not found in pfocr_figures_row for PMC1249490__nihms5296f3.jpg / d8def56c-dec7-11ea-99da-0ac135e8bacf
missing_column_name: publication_year not found in pfocr_figures_row for PMC1307498__nihms5498f3.jpg / 3996467d-df0d-11ea-99da-0ac135e8bacf
missing_column_name: publication_year not found in pfocr_figures_row for PMC1307511__nihms2079f3.jpg / 4957b6d6-df34-11ea-99da-0ac135e8bacf
missing_column_name: publication_year not found in pfocr_figures_row for PMC1351030__nihms2404f8.jpg / bc34e7af-de8d-11ea-99da-0ac135e8bacf
missing_column_name: publication_year not found in pfocr_figures_row for PMC1352153__nihms-7536-0007.jpg / 4a7427d9-de8e-11ea-99da-0ac135e8bacf
missing_column_name: publication_year not found in pfocr_figures_row for PMC1352320__nihms6852f1.jpg / f174c37f-def0-11ea-99da-0ac135e8bacf
missing_column_name: publication_year not found in pfocr_figures_row for PMC1360222__nihms7040f9.jpg / d48f9e7a-deaa-11ea-99da-0ac135e8bacf
m

0

In [None]:
update_networks(my_ndex, (pfocr_figures_df[
    pfocr_figures_df["network_id"].notnull()
]))