In [1]:
from copy import copy
import json
import pandas as pd
import textwrap

# BTE Sleeve

Some queries result in an explosion of the number of results. This is bad for performance of the server and also not very useful to the user. This notebook explores using PFOCR as a "compression sleeve" to dampen these types of explosions. I don't have a CC-licensed image of a compression sleeve, so instead here's an image of a heat-shrink tube.

<a title="oomlout, CC BY-SA 2.0 &lt;https://creativecommons.org/licenses/by-sa/2.0&gt;, via Wikimedia Commons" href="https://commons.wikimedia.org/wiki/File:HESH-06-L-STAN-01_(9601242883).jpg"><img width="512" alt="HESH-06-L-STAN-01 (9601242883)" src="https://upload.wikimedia.org/wikipedia/commons/thumb/d/d3/HESH-06-L-STAN-01_%289601242883%29.jpg/512px-HESH-06-L-STAN-01_%289601242883%29.jpg"></a>

`RAB13 Gene`--entity_regulates_entity|genetically_interacts_with--`Any Gene`--related_to--`Any SmallMolecule`

`n0`--e0--`n1`--e1--`n2`

Note BTE Sleeve currently only works on queries with nodes of categories supported by PFOCR, e.g., `biolink:Gene`, `biolink:SmallMolecule`, `biolink:Disease`. TODO: update the Fisher's Exact test to work with node categories not in PFOCR.

Also, BTE Sleeve currently assumes `n0` has an `ids` parameter. TODO: update to handle `ids` params on any q_node.

In [2]:
import requests
import requests_cache


# Install a requests cache named "pfocr_cache". POST is allowed in addition to
# GET, so POST requests (such as the TRAPI query sent below) can be cached too.
requests_cache.install_cache("pfocr_cache", allowable_methods=['GET', 'POST'])

## Get BTE TRAPI Results

In [3]:
# TRAPI query graph for a two-hop path: n0 --e0--> n1 --e1--> n2.
# n0 is pinned to one gene (NCBIGene:5872); n1 is any Gene; n2 is any SmallMolecule.
query = {
    "message": {
        "query_graph": {
            "edges": {
                # first hop: n0 -> n1, restricted to two predicates
                "e0": {
                    "subject": "n0",
                    "object": "n1",
                    "predicates": [
                        "biolink:entity_regulates_entity",
                        "biolink:genetically_interacts_with"
                    ]
                },
                # second hop: n1 -> n2 via the generic related_to predicate
                "e1": {
                    "subject": "n1",
                    "object": "n2",
                    "predicates": [
                        "biolink:related_to"
                    ]
                }
            },
            "nodes": {
                # the only node with an "ids" constraint
                "n0": {
                    "ids": [
                        "NCBIGene:5872"
                    ],
                    "categories": [
                        "biolink:Gene"
                    ]
                },
                "n1": {
                    "categories": [
                        "biolink:Gene"
                    ],
                },
                "n2": {
                    "categories": [
                        "biolink:SmallMolecule"
                    ]
                }
            }
        }
    }
}

# TRAPI endpoint the query is POSTed to; uncomment one of the alternatives to
# target a different service.
trapi_url = "http://localhost:3000/v1/query"
#trapi_url = "https://api.bte.ncats.io/v1/query"
#trapi_url = "https://arax.ncats.io/api/rtxkg2/v1.2/query"
#trapi_url = "https://arax.ncats.io/api/arax/v1.2/query"
trapi_response = requests.post(trapi_url, json=query)

trapi_status_code = trapi_response.status_code
print(trapi_status_code)
if trapi_status_code != 200:
    # show the response body to help diagnose the failure
    print(trapi_response.text)
if trapi_response.from_cache:
    print("got response from cache")

# everything downstream works on the "message" part of the response
trapi_message = trapi_response.json()["message"]

#print(json.dumps(query))

200
got response from cache


In [4]:
# Start the category set from every category declared on a query-graph node.
# NOTE: categories found in the TRAPI results are added to this same set in a
# later cell. Some categories are supersets of others.
# TODO: maybe handle this systematically?
curie_categories = {
    category
    for q_node in query["message"]["query_graph"]["nodes"].values()
    if "categories" in q_node
    for category in q_node["categories"]
}

In [5]:
# Map every CURIE listed in a knowledge-graph node's "biolink:xref" attribute
# onto one "unified" CURIE, chosen from the preferred prefixes below
# (for genes/gene products, chemicals and diseases).
preferred_prefixes = set(["NCBIGene", "MESH"])
unified_names = set()
unified_curies = set()
# node keys for which no unified CURIE could be determined
unification_failed_curies = set()
unified_prefixes = set()
all_prefixes = set()
curie_to_name = dict()
curie_to_unified_curie = dict()
for k, v in trapi_message["knowledge_graph"]["nodes"].items():
    name = v["name"]
    for category in v["categories"]:
        curie_categories.add(category)

    for a in v["attributes"]:
        if a["attribute_type_id"] == "biolink:xref":
            curies = a["value"]

            # k should always be one of the curies
            if k not in curies:
                raise Exception(f"key {k} not in {curies}")

            unified_curie = None
            # reuse a unified CURIE chosen for an earlier node when this node
            # shares it; sharing more than one is treated as an error
            intersecting_unified_curies = unified_curies.intersection(set(curies))
            if len(intersecting_unified_curies) > 1:
                raise Exception(f"matching multiple: {k} to {list(intersecting_unified_curies)}")
            elif len(intersecting_unified_curies) == 1:
                unified_curie = list(intersecting_unified_curies)[0]
            else:
                # get curie for preferred prefix. usually this is k, but not always.
                for curie in curies:
                    # maxsplit=1: only the first ":" separates prefix from
                    # identifier, so identifiers containing ":" do not raise
                    [prefix, identifier] = curie.split(":", 1)
                    if prefix in preferred_prefixes:
                        unified_curie = curie
                        unified_curies.add(unified_curie)
                        break

            if not unified_curie:
                if k in curie_to_unified_curie:
                    unified_curie = curie_to_unified_curie[k]
                else:
                    # give up on this node: record the failure and stop
                    # looking at its remaining attributes
                    unification_failed_curies.add(k)
                    break

                #raise Exception(f"failed to find a unified curie for {k} in {curies}")

            [unified_prefix, unified_identifier] = unified_curie.split(":", 1)
            unified_prefixes.add(unified_prefix)

            for curie in curies:
                [prefix, identifier] = curie.split(":", 1)
                all_prefixes.add(prefix)
                # first mapping / first name seen for a CURIE wins
                if curie not in curie_to_unified_curie:
                    curie_to_unified_curie[curie] = unified_curie
                if curie not in curie_to_name:
                    curie_to_name[curie] = name
                    unified_names.add(name)
                elif curie_to_name[curie] != name:
                    print(f"curie {curie} has multiple primary names: {curie_to_name[curie]} and {name}")
                    #raise Exception(f"curie{curie} has multiple names: {curie_to_name[curie]} and {name}")


#print(f'curie_categories: {curie_categories}')
#print("")
#print(f"curie_to_name key count: {len(curie_to_name.keys())}")
#print(f"curie_to_unified_curie key count: {len(curie_to_unified_curie.keys())}")
#print("")
print("all CURIE prefixes found:")
print(all_prefixes)
print("")
print("unified CURIE prefixes found:")
print(unified_prefixes)
print("")
print(f"failed to unify {len(unification_failed_curies)} CURIEs")

curie UMLS:C0903898 has multiple primary names: NATEGLINIDE and nateglinide
curie UMLS:C0021641 has multiple primary names: Iletin and Insulin
all CURIE prefixes found:
{'CAS', 'GTOPDB', 'ENSEMBL', 'KEGG.COMPOUND', 'DRUGBANK', 'DrugCentral', 'HMDB', 'PR', 'NCBIGene', 'HGNC', 'CHEMBL.COMPOUND', 'CHEBI', 'UMLS', 'INCHIKEY', 'OMIM', 'UniProtKB', 'UNII', 'MESH', 'PUBCHEM.COMPOUND'}

unified CURIE prefixes found:
{'MESH', 'NCBIGene'}

failed to unify 886 CURIEs


In [6]:
# Derive the results-table column layout from the query graph: node and edge
# ids in the order they are encountered, then per-node CURIE columns, then the
# combined CURIE tuple.
columns = []
q_node_id_keys = set(["object", "subject"])
q_node_ids = []
q_edge_ids = []
for q_edge_id, q_edge in query["message"]["query_graph"]["edges"].items():
    q_edge_ids.append(q_edge_id)
    edge_column_added = False
    for edge_key, edge_value in q_edge.items():
        # only the string-valued "subject"/"object" entries name query nodes
        if edge_key not in q_node_id_keys or type(edge_value) is not str:
            continue
        if edge_value not in columns:
            q_node_ids.append(edge_value)
            columns.append(edge_value)
        # the edge column goes right after the first node entry of this edge
        if not edge_column_added:
            edge_column_added = True
            columns.append(q_edge_id)

for q_node_id in q_node_ids:
    columns.extend([f"{q_node_id}_curie", f"{q_node_id}_unified_curie"])

columns.append("trapi_result_curie_combo")

# Interleave node and edge ids (node, edge, node, ...) for display purposes.
trapi_result_columns = []
for position, q_node_id in enumerate(q_node_ids):
    trapi_result_columns.append(q_node_id)
    if position < len(q_edge_ids):
        trapi_result_columns.append(q_edge_ids[position])

In [7]:
from copy import deepcopy


trapi_results = trapi_message["results"]

# Flatten the TRAPI results into one row per (result, predicate combination).
# Results whose CURIEs cannot all be unified are skipped.
result_row_data = []
for trapi_result in trapi_results:
    # bound KG node CURIE -> query node id(s) it is bound to
    curie_to_qnode_ids = dict()
    for qnode_id, entries in trapi_result["node_bindings"].items():
        for entry in entries:
            curie = entry["id"]
            if curie not in curie_to_qnode_ids:
                curie_to_qnode_ids[curie] = []
            curie_to_qnode_ids[curie].append(qnode_id)

    row_data_template = dict()
    q_edge_id_to_predicates = dict()
    trapi_result_curies = set()
    for qedge_id, entries in trapi_result["edge_bindings"].items():
        for entry in entries:
            # the binding id is a knowledge-graph edge key, not a node CURIE
            kg_edge_id = entry["id"]
            kg_entry = trapi_message["knowledge_graph"]["edges"][kg_edge_id]
            subject_curie = kg_entry["subject"]
            object_curie = kg_entry["object"]
            predicate_curie = kg_entry["predicate"]
            # maxsplit=1: split on the first ":" only
            [predicate_prefix, predicate_identifier] = predicate_curie.split(":", 1)

            if qedge_id not in q_edge_id_to_predicates:
                q_edge_id_to_predicates[qedge_id] = set()
            q_edge_id_to_predicates[qedge_id].add(predicate_identifier)

            for curie in [subject_curie, object_curie]:
                for qnode_id in curie_to_qnode_ids[curie]:
                    if curie in curie_to_unified_curie:
                        unified_curie = curie_to_unified_curie[curie]
                    else:
                        # CURIE could not be unified: record nothing for it
                        break

                    name = curie_to_name[curie]
                    row_data_template[qnode_id] = name

                    trapi_result_curies.add(unified_curie)
                    row_data_template[qnode_id + "_curie"] = curie
                    row_data_template[qnode_id + "_unified_curie"] = unified_curie

    # keep only results with exactly one distinct unified CURIE per query node
    if len(trapi_result_curies) != len(q_node_ids):
        #print(f'skipping {list(curie_to_qnode_ids.keys())}')
        continue

    row_data_template["trapi_result_curie_combo"] = tuple(sorted(trapi_result_curies))
    # Expand the template into one row per combination of predicates across
    # the query edges.
    row_datas = [row_data_template]
    for q_edge_id, predicates in q_edge_id_to_predicates.items():
        next_row_datas = []
        for row_data in row_datas:
            # sorted() makes the row order reproducible; iterating the set
            # directly would depend on hash order
            for predicate in sorted(predicates):
                next_row_data = deepcopy(row_data)
                next_row_data[q_edge_id] = predicate
                next_row_datas.append(next_row_data)
        row_datas = next_row_datas
    result_row_data += row_datas

print("warning: predicate direction(s) may be switched")
results_df = pd.DataFrame.from_records(result_row_data, columns=columns).drop_duplicates()
results_df



Unnamed: 0,n0,e0,n1,e1,n2,n0_curie,n0_unified_curie,n1_curie,n1_unified_curie,n2_curie,n2_unified_curie,trapi_result_curie_combo
0,RAB13,entity_positively_regulates_entity,RAB18,entity_positively_regulated_by_entity,WATER,NCBIGene:5872,NCBIGene:5872,NCBIGene:22931,NCBIGene:22931,CHEMBL.COMPOUND:CHEMBL1098659,MESH:D014867,"(MESH:D014867, NCBIGene:22931, NCBIGene:5872)"
1,RAB13,entity_positively_regulates_entity,RAB18,entity_negatively_regulated_by_entity,WATER,NCBIGene:5872,NCBIGene:5872,NCBIGene:22931,NCBIGene:22931,CHEMBL.COMPOUND:CHEMBL1098659,MESH:D014867,"(MESH:D014867, NCBIGene:22931, NCBIGene:5872)"
2,RAB13,entity_positively_regulates_entity,RAB18,entity_positively_regulated_by_entity,Isopropyl-beta-D-thiogalactopyranoside,NCBIGene:5872,NCBIGene:5872,NCBIGene:22931,NCBIGene:22931,PUBCHEM.COMPOUND:656894,MESH:D007544,"(MESH:D007544, NCBIGene:22931, NCBIGene:5872)"
3,RAB13,entity_positively_regulates_entity,RAB18,entity_positively_regulated_by_entity,Trichostatin A,NCBIGene:5872,NCBIGene:5872,NCBIGene:22931,NCBIGene:22931,PUBCHEM.COMPOUND:444732,MESH:C012589,"(MESH:C012589, NCBIGene:22931, NCBIGene:5872)"
4,RAB13,entity_positively_regulates_entity,RAB18,entity_positively_regulated_by_entity,Mannitol,NCBIGene:5872,NCBIGene:5872,NCBIGene:22931,NCBIGene:22931,PUBCHEM.COMPOUND:6251,MESH:D008353,"(MESH:D008353, NCBIGene:22931, NCBIGene:5872)"
...,...,...,...,...,...,...,...,...,...,...,...,...
6125,RAB13,genetically_interacts_with,TGFBR1,coexists_with,5' Flanking Region,NCBIGene:5872,NCBIGene:5872,NCBIGene:7046,NCBIGene:7046,UMLS:C0949645,MESH:D024506,"(MESH:D024506, NCBIGene:5872, NCBIGene:7046)"
6126,RAB13,genetically_interacts_with,TGFBR1,coexists_with,"1,3,4,6-Tetra-O-acetyl-2-azido-2-deoxy-beta-D-...",NCBIGene:5872,NCBIGene:5872,NCBIGene:7046,NCBIGene:7046,PUBCHEM.COMPOUND:133445,MESH:C070423,"(MESH:C070423, NCBIGene:5872, NCBIGene:7046)"
6127,RAB13,genetically_interacts_with,TGFBR1,affects_response_to,Thioacetamide,NCBIGene:5872,NCBIGene:5872,NCBIGene:7046,NCBIGene:7046,PUBCHEM.COMPOUND:2723949,MESH:D013853,"(MESH:D013853, NCBIGene:5872, NCBIGene:7046)"
6128,RAB13,genetically_interacts_with,TGFBR1,increases_response_to,Thioacetamide,NCBIGene:5872,NCBIGene:5872,NCBIGene:7046,NCBIGene:7046,PUBCHEM.COMPOUND:2723949,MESH:D013853,"(MESH:D013853, NCBIGene:5872, NCBIGene:7046)"


Unique CURIE count per query node:

In [8]:
# Number of distinct values in each query node's column of results_df.
for q_node_id in q_node_ids:
    distinct_value_count = len(set(results_df[q_node_id]))
    print(f'{q_node_id}: {distinct_value_count}')

n0: 1
n1: 12
n2: 1347


Total results count (excluding any that couldn't be unified):

In [9]:
# number of distinct unified-CURIE combinations among the TRAPI results
results_df["trapi_result_curie_combo"].nunique()

3158

That's too high for a researcher to efficiently go through manually. Let's try filtering to make that more manageable.

### TODO: NCBIGene for SmallMolecules?
Why are some of the returned TRAPI results using NCBIGene CURIEs for small molecules? 

In [10]:
# Rows where the n2 (SmallMolecule) node carries an NCBIGene CURIE, either as
# returned or after unification.
n2_is_ncbigene = (
    results_df["n2_curie"].str.startswith("NCBIGene:")
    | results_df["n2_unified_curie"].str.startswith("NCBIGene:")
)
results_df.loc[n2_is_ncbigene, ["n2", "n2_curie", "n2_unified_curie"]]

Unnamed: 0,n2,n2_curie,n2_unified_curie
4724,SMAD3,NCBIGene:4088,NCBIGene:4088


## Match up BTE TRAPI Results & PFOCR

We're going to try using PFOCR to filter and prioritize the results.

First we need to get the PFOCR Data. We could get it from the API, but for now, we'll just go ahead and download the entire JSON file we gave to BTE.

In [11]:
# Download the full PFOCR NDJSON file. Despite its name, pfocr_request holds
# the HTTP response.
pfocr_url = "https://www.dropbox.com/s/1f14t5zaseocyg6/bte_chemicals_diseases_genes.ndjson?dl=1"
pfocr_request = requests.get(pfocr_url)
pfocr_status_code = pfocr_request.status_code
print(f"status_code: {pfocr_status_code}")
if pfocr_status_code != 200:
    # show the response body to help diagnose the failure
    print(pfocr_request.text)

status_code: 200


In [12]:
# Index the PFOCR figures by the set of (unified) CURIEs each one mentions.

# (mention group, identifier field, CURIE prefix) for each kind of PFOCR mention
pfocr_mention_sources = [
    ("chemicals", "mesh", "MESH:"),
    ("diseases", "mesh", "MESH:"),
    ("genes", "ncbigene", "NCBIGene:"),
]

curies_to_figure_ids = {}
figure_id_to_curies = {}
figure_id_to_pfocr_result = {}
# Split on "\n" only: str.splitlines() also breaks on characters such as
# U+2028 that may appear unescaped inside a JSON string, which would cut an
# NDJSON record in two.
for line in pfocr_request.text.split("\n"):
    if not line.strip():
        # skip blank lines (e.g. after a trailing newline)
        continue
    pfocr_result = json.loads(line)
    figure_id = pfocr_result["_id"]

    mentions = pfocr_result["associatedWith"]["mentions"]
    curies = set()
    for mention_group, identifier_field, curie_prefix in pfocr_mention_sources:
        for identifier in mentions[mention_group][identifier_field]:
            curie = curie_prefix + identifier
            # Use the unified CURIE when the BTE results know this CURIE.
            # Otherwise keep the CURIE itself: it isn't in the BTE results,
            # but it still counts towards the denominator in
            # jaccard/containment calculations.
            curies.add(curie_to_unified_curie.get(curie, curie))

    figure_id_to_pfocr_result[figure_id] = pfocr_result
    figure_id_to_curies[figure_id] = curies

    # figures mentioning exactly the same CURIEs share one key
    curies_key = tuple(sorted(curies))
    curies_to_figure_ids.setdefault(curies_key, []).append(figure_id)

In [13]:
from SetSimilaritySearch import SearchIndex

# Match every TRAPI result CURIE combo against the PFOCR figure CURIE sets and
# record one row per (TRAPI combo, matching figure) pair.

# the reference sets are supposed to be from PFOCR only
reference_sets = list()
for curies in set(curies_to_figure_ids.keys()):
    reference_sets.append(set(curies))
# but in order to calculate the scores correctly,
# the SetSimilaritySearch library requires that
# every curie from the bte results needs to be
# mentioned at least once in the reference set.
# That's the only reason we add them below, as
# one large set.

unified_curie_columns = [
    q_node_id + "_unified_curie" for q_node_id in q_node_ids
]

# distinct TRAPI results, each as a sorted tuple of its unified CURIEs
trapi_result_curie_combos = set()
for i, df in results_df[unified_curie_columns].drop_duplicates().iterrows():
    trapi_result_curies = []
    for unified_curie_column in unified_curie_columns:
        trapi_result_curies.append(
            df[unified_curie_column]
        )
    trapi_result_curie_combos.add(tuple(sorted(
        trapi_result_curies
    )))
    
# the one extra reference set holding every CURIE from the TRAPI results
reference_set = set()
for trapi_result_curie_combo in trapi_result_curie_combos:
    reference_set |= set(trapi_result_curie_combo)
reference_sets.append(reference_set)

# threshold = matchable_node_min / number of query nodes
matchable_node_min = 2
matchable_node_count = len(q_node_ids)
index = SearchIndex(reference_sets, similarity_func_name="containment", 
    similarity_threshold=matchable_node_min/matchable_node_count)

trapi_figure_overlap_rows = []
for trapi_result_curie_combo in trapi_result_curie_combos:
    trapi_curies = set(trapi_result_curie_combo)
    results = index.query(trapi_curies)
    for result in results:
        # result[0] is used as a position into reference_sets, result[1] as the score
        figure_curie_combos = reference_sets[result[0]]
        figure_curie_key = tuple(sorted(figure_curie_combos))
        
        # needed to not match bte results to themselves
        if figure_curie_key in curies_to_figure_ids:
            figure_ids = curies_to_figure_ids[figure_curie_key]
        else:
            continue
            
        score = result[1]
        
        common_curies = trapi_curies.intersection(figure_curie_combos)
        # one row per figure that shares this exact CURIE set
        for figure_id in figure_ids:
            pfocr_result = figure_id_to_pfocr_result[figure_id]
            trapi_figure_overlap_rows.append({
                "figure_id": figure_id,
                "figure_url": pfocr_result["associatedWith"]["figureUrl"],
                "figure_title": pfocr_result["associatedWith"]["title"],
                "trapi_result_curie_combo": trapi_result_curie_combo,
                "figure_curie_combo": figure_curie_key,
                "overlap_curie_combo": tuple(sorted(common_curies)),
                "score": score,
            })


trapi_figure_overlap_df = pd.DataFrame.from_records(trapi_figure_overlap_rows)
trapi_figure_overlap_df

Unnamed: 0,figure_id,figure_url,figure_title,trapi_result_curie_combo,figure_curie_combo,overlap_curie_combo,score
0,PMC5834423__fendo-09-00046-g001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,MicroRNAs (miRNAs) targeting insulin signaling...,"(MESH:C540576, NCBIGene:201475, NCBIGene:5872)","(MESH:D005947, MESH:D011125, NCBIGene:10000, N...","(NCBIGene:201475, NCBIGene:5872)",0.666667
1,PMC6466823__ml-2018-00593r_0004.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Main pathways and respective genes found in di...,"(MESH:C540576, NCBIGene:201475, NCBIGene:5872)","(MESH:C000589078, MESH:C051905, MESH:C052497, ...","(NCBIGene:201475, NCBIGene:5872)",0.666667
2,PMC7904555__18_2020_3656_Fig2_HTML.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Intracellular trafficking pathways of TLR4 and...,"(MESH:C540576, NCBIGene:201475, NCBIGene:5872)","(MESH:C069631, MESH:C411671, NCBIGene:10133, N...","(NCBIGene:201475, NCBIGene:5872)",0.666667
3,PMC7097195__41573_2019_36_Fig2_HTML.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Lysosomes as a therapeutic target,"(MESH:C540576, NCBIGene:201475, NCBIGene:5872)","(MESH:C057620, MESH:C116960, MESH:C118025, MES...","(MESH:C540576, NCBIGene:201475, NCBIGene:5872)",1.000000
4,PMC3374753__2352fig7.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,SHRC interaction with retromer on membranes,"(MESH:C540576, NCBIGene:201475, NCBIGene:5872)","(NCBIGene:10890, NCBIGene:10966, NCBIGene:1098...","(NCBIGene:201475, NCBIGene:5872)",0.666667
...,...,...,...,...,...,...,...
66865,PMC5546324__nihms866088f6.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,AKT/PKB Signaling: Navigating the Network,"(MESH:D019787, NCBIGene:3630, NCBIGene:5872)","(MESH:D000249, MESH:D005947, NCBIGene:10000, N...","(NCBIGene:3630, NCBIGene:5872)",0.666667
66866,PMC7884075__elife-58615-fig4.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,GWAS hits in the IGF-1 pathway,"(MESH:D019787, NCBIGene:3630, NCBIGene:5872)","(MESH:D009584, NCBIGene:10000, NCBIGene:10890,...","(NCBIGene:3630, NCBIGene:5872)",0.666667
66867,PMC5518130__12864_2017_3907_Fig4_HTML.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,Differently expressed protein-coding genes and...,"(MESH:D019787, NCBIGene:3630, NCBIGene:5872)","(MESH:C031143, MESH:D000105, MESH:D000214, MES...","(NCBIGene:3630, NCBIGene:5872)",0.666667
66868,PMC6770969__genes-10-00728-g006.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,KEGG AMPK signaling,"(MESH:D019787, NCBIGene:3630, NCBIGene:5872)","(MESH:C031143, MESH:C504504, MESH:D000105, MES...","(NCBIGene:3630, NCBIGene:5872)",0.666667


### trapi_curie_combo_count_by_figure_df

In [14]:
# For each figure, count the TRAPI CURIE combos it was matched with, largest first.
trapi_curie_combo_count_by_figure_df = (
    trapi_figure_overlap_df[["figure_id", "trapi_result_curie_combo"]]
    .groupby("figure_id")
    .count()
    .rename(columns={"trapi_result_curie_combo": "trapi_curie_combo_count"})
    .sort_values(by="trapi_curie_combo_count", ascending=False)
)
trapi_curie_combo_count_by_figure_df

Unnamed: 0_level_0,trapi_curie_combo_count
figure_id,Unnamed: 1_level_1
PMC6406872__cells-08-00192-g003.jpg,1470
PMC6472122__MMR-19-05-3564-g04.jpg,1182
PMC5518130__12864_2017_3907_Fig4_HTML.jpg,1175
PMC6770969__genes-10-00728-g006.jpg,1165
PMC6895093__41598_2019_54408_Fig15_HTML.jpg,1150
...,...
PMC4623426__fncel-09-00426-g0004.jpg,1
PMC4623426__fncel-09-00426-g0003.jpg,1
PMC4620423__WJSC-7-1150-g001.jpg,1
PMC4610971__gr1.jpg,1


### Figure count by TRAPI CURIE combo

In [15]:
# For each TRAPI CURIE combo, count the figures it was matched with, largest first.
(
    trapi_figure_overlap_df[["trapi_result_curie_combo", "figure_id"]]
    .groupby("trapi_result_curie_combo")
    .count()
    .rename(columns={"figure_id": "figure_count"})
    .sort_values(by="figure_count", ascending=False)
)

Unnamed: 0_level_0,figure_count
trapi_result_curie_combo,Unnamed: 1_level_1
"(NCBIGene:4088, NCBIGene:4089, NCBIGene:5872)",1790
"(MESH:D002784, NCBIGene:22931, NCBIGene:5872)",192
"(MESH:D004317, NCBIGene:22931, NCBIGene:5872)",191
"(MESH:D003561, NCBIGene:22931, NCBIGene:5872)",191
"(MESH:C423915, NCBIGene:22931, NCBIGene:5872)",191
...,...
"(MESH:C085911, NCBIGene:5872, NCBIGene:6789)",1
"(MESH:C085911, NCBIGene:5872, NCBIGene:7316)",1
"(MESH:C554092, NCBIGene:5147, NCBIGene:5872)",1
"(MESH:C506244, NCBIGene:5147, NCBIGene:5872)",1


### Figure containment score by TRAPI CURIE combo

This can yield the same order of results, but if one TRAPI result has higher overlap of CURIEs with the figure(s), that TRAPI result could leapfrog over another TRAPI result that overlaps with more figures but at lower containment scores.

Note the weakness of containment is it doesn't take into account the number of CURIEs in the figure.

In [16]:
# Sum the containment scores of all figure matches per TRAPI CURIE combo,
# highest cumulative score first.
trapi_result_figure_score_df = (
    trapi_figure_overlap_df[["trapi_result_curie_combo", "score"]]
    .groupby("trapi_result_curie_combo")
    .sum()
    .rename(columns={"score": "cumulative_trapi_result_curie_combo_figure_score"})
    .sort_values(by="cumulative_trapi_result_curie_combo_figure_score", ascending=False)
)
trapi_result_figure_score_df

Unnamed: 0_level_0,cumulative_trapi_result_curie_combo_figure_score
trapi_result_curie_combo,Unnamed: 1_level_1
"(NCBIGene:4088, NCBIGene:4089, NCBIGene:5872)",1195.333333
"(MESH:D002784, NCBIGene:22931, NCBIGene:5872)",139.666667
"(MESH:D019821, NCBIGene:201475, NCBIGene:5872)",131.000000
"(MESH:C031143, NCBIGene:22931, NCBIGene:5872)",128.333333
"(MESH:D013395, NCBIGene:22931, NCBIGene:5872)",128.000000
...,...
"(MESH:C085911, NCBIGene:5872, NCBIGene:6789)",0.666667
"(MESH:C085911, NCBIGene:5872, NCBIGene:7316)",0.666667
"(MESH:C554092, NCBIGene:5147, NCBIGene:5872)",0.666667
"(MESH:C506244, NCBIGene:5147, NCBIGene:5872)",0.666667


In [17]:
# Right-hand tables for the joins below, with their group keys turned back
# into ordinary columns.
cumulative_score_by_combo_df = trapi_result_figure_score_df.reset_index()
combo_count_by_figure_df = trapi_curie_combo_count_by_figure_df.reset_index().rename(
    columns={"trapi_curie_combo_count": "trapi_curie_combo_count_by_figure"}
)

# Attach the TRAPI result rows, the per-combo cumulative score and the
# per-figure combo count to every (combo, figure) overlap row.
results_with_figures_df = (
    trapi_figure_overlap_df
    .merge(results_df, on="trapi_result_curie_combo", how="left")
    .merge(
        cumulative_score_by_combo_df,
        on="trapi_result_curie_combo",
        how="left",
        validate="many_to_one",
    )
    .merge(
        combo_count_by_figure_df,
        on="figure_id",
        how="left",
        validate="many_to_one",
    )
    .sort_values(by="cumulative_trapi_result_curie_combo_figure_score", ascending=False)
)

## Fisher's Exact Test

### TODO: vectorize
For better performance, try vectorizing the calculation of Fisher's Exact test, e.g.:

https://stackoverflow.com/questions/34947578/how-to-vectorize-fishers-exact-test

In [18]:
# Tally the identifiers mentioned across all PFOCR figures, per mention type:
# total mentions (instance counts) and distinct identifiers (the sets).
all_genes = set()
gene_instance_count = 0
all_chemicals = set()
chemical_instance_count = 0
all_diseases = set()
disease_instance_count = 0
for pfocr_result in figure_id_to_pfocr_result.values():
    figure_mentions = pfocr_result["associatedWith"]["mentions"]

    genes = figure_mentions["genes"]["ncbigene"]
    gene_instance_count += len(genes)
    all_genes.update(genes)

    chemicals = figure_mentions["chemicals"]["mesh"]
    chemical_instance_count += len(chemicals)
    all_chemicals.update(chemicals)

    diseases = figure_mentions["diseases"]["mesh"]
    disease_instance_count += len(diseases)
    all_diseases.update(diseases)

print("total instance counts:")
print(f'  chemicals: {chemical_instance_count}')
print(f'  diseases: {disease_instance_count}')
print(f'  genes: {gene_instance_count}')

print("")

print("unique counts:")
print(f'  chemicals: {len(all_chemicals)}')
print(f'  diseases: {len(all_diseases)}')
print(f'  genes: {len(all_genes)}')

print("")

# which identifier pool each supported category draws from
curie_category_to_curies = {
    "biolink:ChemicalEntity": all_chemicals,
    "biolink:SmallMolecule": all_chemicals,
    "biolink:Disease": all_diseases,
    "biolink:Gene": all_genes,
}


# Union of the pools for the categories that actually occur in curie_categories;
# its size is used as M in the Fisher's Exact test.
all_figure_curies = set().union(*(
    category_curies
    for curie_category, category_curies in curie_category_to_curies.items()
    if curie_category in curie_categories
))
all_figure_curie_count = len(all_figure_curies)

print(f'M for Fisher Exact test: {all_figure_curie_count}')
print('TODO: should this number be based on what is in BTE/TRAPI?')

total instance counts:
  chemicals: 275456
  diseases: 20465
  genes: 1369680

unique counts:
  chemicals: 14482
  diseases: 1430
  genes: 14253

M for Fisher Exact test: 28735
TODO: should this number be based on what is in BTE/TRAPI?


See [this contingency table](https://docs.google.com/spreadsheets/d/1d-48R5N0auuKXSLJ7oZ2E_ldO26Z5KiEH3AQx5kDrC4/edit#gid=0):

|      |  |
| ----------- | ----------- |
| trapi_curies_in_pwy      | q_node_id_count - trapi_curies_in_pwy       |
| curies_in_pwy - trapi_curies_in_pwy   | 28735 - curies_in_pwy - q_node_id_count + trapi_curies_in_pwy        |

and [the scipy documentation](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.fisher_exact.html):

|      |  |
| ----------- | ----------- |
| x      | n - x       |
| N - x   | M - (n + N) + x        |

In [19]:
# Build the four cells of the 2x2 contingency table, one column per cell.
overlap_curie_counts = results_with_figures_df["overlap_curie_combo"].map(len)
figure_curie_counts = results_with_figures_df["figure_curie_combo"].map(len)
q_node_id_count = len(q_node_ids)

# x: trapi_curies_in_pwy
results_with_figures_df["yes_pwy_yes_trapi"] = overlap_curie_counts

# n - x: q_node_id_count - trapi_curies_in_pwy
results_with_figures_df["no_pwy_yes_trapi"] = q_node_id_count - overlap_curie_counts

# N - x: curies_in_pwy - trapi_curies_in_pwy
results_with_figures_df["yes_pwy_no_trapi"] = figure_curie_counts - overlap_curie_counts

# M - (n + N) + x:
# all_figure_curie_count - curies_in_pwy - q_node_id_count + trapi_curies_in_pwy
results_with_figures_df["no_pwy_no_trapi"] = (
    all_figure_curie_count - figure_curie_counts - q_node_id_count + overlap_curie_counts
)

In [20]:
import scipy.stats as stats


contingency_cell_columns = [
    "yes_pwy_yes_trapi",
    "no_pwy_yes_trapi",
    "yes_pwy_no_trapi",
    "no_pwy_no_trapi",
]


def fisher_exact_for_row(r):
    """Run Fisher's Exact test on the 2x2 contingency table stored in row r."""
    return stats.fisher_exact([
        [r.yes_pwy_yes_trapi, r.no_pwy_yes_trapi],
        [r.yes_pwy_no_trapi, r.no_pwy_no_trapi],
    ])


# one test per row (see the TODO above about vectorizing this)
results_with_figures_df["fishers_exact"] = results_with_figures_df[
    contingency_cell_columns
].apply(fisher_exact_for_row, axis=1)

# element 1 of each test result is taken as the p-value
results_with_figures_df["p_value"] = results_with_figures_df["fishers_exact"].map(
    lambda fisher_result: fisher_result[1]
)

In [21]:
# p-values from smallest to largest
results_with_figures_df["p_value"].sort_values(ascending=True)

14000     1.572988e-08
13968     1.639452e-08
14045     1.707763e-08
13891     1.777945e-08
14007     1.850024e-08
              ...     
50680     3.258204e-04
131143    4.786686e-04
131144    4.786686e-04
131145    4.786686e-04
131146    4.786686e-04
Name: p_value, Length: 135113, dtype: float64

## Analysis of results with vs. without BTE Sleeve

With the current query graph, requiring co-occurrence of TRAPI result CURIE(s) for particular node(s) in at least one PFOCR figure cuts down the number of TRAPI result CURIE combos.

### Without BTE Sleeve

For the total number of TRAPI result CURIE combos returned, we're only
including a combo when every CURIE in it can be unified. If any of the
CURIEs in a combo cannot be unified, we drop that combo.

In [22]:
# baseline totals before any co-occurrence filtering
unified_trapi_result_count = len(set(results_df["trapi_result_curie_combo"]))
pfocr_figure_count = len(figure_id_to_curies.keys())
print(f'{unified_trapi_result_count} TRAPI results')
print(f'{pfocr_figure_count} figures')

3158 TRAPI results
77719 figures


### With BTE Sleeve
The totals when the TRAPI result CURIE(s) for specified nodes must be found in at least one figure:

In [23]:
from itertools import combinations


# For each query node: does the row's unified CURIE for that node appear in the
# row's overlap with the figure? Computed once per node and reused for every
# combination below, instead of being recomputed row-by-row inside the loops.
q_node_in_overlap = {
    q_node_id: pd.Series(
        [
            unified_curie in overlap_curie_combo
            for unified_curie, overlap_curie_combo in zip(
                results_with_figures_df[f'{q_node_id}_unified_curie'],
                results_with_figures_df["overlap_curie_combo"],
            )
        ],
        index=results_with_figures_df.index,
        dtype=bool,
    )
    for q_node_id in q_node_ids
}

# Report totals for every combination of two or more query nodes whose CURIEs
# must all co-occur in a figure.
for i in range(2, len(q_node_ids) + 1):
    for q_node_id_combination in combinations(q_node_ids, i):
        print(f'{" & ".join(q_node_id_combination)}')
        s = pd.Series(True, index=results_with_figures_df.index)
        for q_node_id in q_node_id_combination:
            s = s & q_node_in_overlap[q_node_id]
        matching_rows_df = results_with_figures_df[s]
        print(f'  {len(set(matching_rows_df["trapi_result_curie_combo"]))} TRAPI result(s)')
        print(f'  {len(set(matching_rows_df["figure_id"]))} figure(s)')

n0 & n1
  2055 TRAPI result(s)
  196 figure(s)
n0 & n2
  268 TRAPI result(s)
  105 figure(s)
n1 & n2
  306 TRAPI result(s)
  2144 figure(s)
n0 & n1 & n2
  41 TRAPI result(s)
  48 figure(s)


## View TRAPI results with figures

Click a figure to get its paper.

Compare these results with the `results_df` table from earlier in this notebook and also with the [ARAX UI](https://arax.ncats.io/) (copy the query into the JSON input field and reformat to make valid JSON).

Does requiring co-occurrence for specific nodes like `n0` and `n2` help? Does ranking by p-value help?

### co-occurrence: any two nodes

In [24]:
# every overlap row, ordered from lowest to highest p-value
any_two_cooccurrence_df = results_with_figures_df.sort_values(by="p_value")

#### By TRAPI result

In [25]:
from html import escape

from IPython.display import HTML, Image, display


# Render the first `limit` TRAPI results of any_two_cooccurrence_df (which is
# sorted by p-value; groupby(sort=False) keeps that order), each followed by
# thumbnails of up to `display_count_limit` of its figures.
limit = 3
count = 0
display_count_limit = 10

for trapi_result_curie_combo, df1 in any_two_cooccurrence_df.groupby("trapi_result_curie_combo", sort=False):
    # These per-combo values are assumed constant within the group, so take the
    # first row (deterministic, unlike picking an element out of a set).
    cumulative_trapi_result_curie_combo_figure_score = (
        df1["cumulative_trapi_result_curie_combo_figure_score"].iloc[0]
    )
    ordered_trapi_result_curie_combo = [
        df1[f"{q_node_id}_unified_curie"].iloc[0] for q_node_id in q_node_ids
    ]

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px;">
  <span>{escape(", ".join(ordered_trapi_result_curie_combo))}</span>
  <span>(cumulative containment score: {cumulative_trapi_result_curie_combo_figure_score:.2f})</span>
</div>
'''))

    # Names to highlight: the overlap of the first row in this group only.
    overlap_names = [
        curie_to_name[curie] for curie in df1["overlap_curie_combo"].iloc[0]
    ]

    trapi_result_rows = []
    for row_index, row in df1[trapi_result_columns].drop_duplicates().iterrows():
        trapi_result_chunks = []
        for trapi_result_column in trapi_result_columns:
            row_value = row[trapi_result_column]
            # Escape so names containing <, > or & cannot break the markup.
            trapi_result_chunk = escape(f'{row_value}')
            if row_value in overlap_names:
                trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{trapi_result_chunk}</span>
'''
            trapi_result_chunks.append(trapi_result_chunk)
        trapi_result_rows.append(" - ".join(trapi_result_chunks))
    trapi_result_rows_html = "<br>".join(trapi_result_rows)
    display(HTML(data=f'''
<div style="font-size: small;">
{trapi_result_rows_html}
</div>
'''))

    figures = []
    row_height = '100px'
    for figure_id, df0 in df1.groupby("figure_id", sort=False):
        # figure_id is split into the PMC article id and the image filename;
        # maxsplit=1 tolerates a filename that itself contains "__".
        pmc, filename = figure_id.split("__", 1)
        paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
        figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
        figure_title = df0["figure_title"].iloc[0]
        figure_title_limited = escape(
            textwrap.shorten(figure_title, width=45, placeholder="...")
        )

        p_value = df0["p_value"].iloc[0]
        score = df0["score"].iloc[0]

        # TODO: a <figure>/<figcaption> layout did not display correctly on
        # GitHub (the caption appeared to the right of the image), so the
        # caption is a plain <div> below the figure instead.
        # The outer div's style attribute used to be missing its closing quote,
        # which swallowed the <figure> opening tag.
        figures.append(f'''
        <div style="margin: 5px !important; padding: 5px !important;">
            <figure style="margin: 5px !important;">
                <a target="_blank" href="{paper_url}">
                  <img src="{figure_url}" style="height: {row_height}">
                </a>
            </figure>
            <div style="font-size: x-small;">
                {figure_title_limited}<br>
                p-value: {p_value:.1e},
                containment: {score:.2}
            </div>
        </div>
        ''')

    figure_count = len(figures)
    figures_html = ''.join(figures[:display_count_limit])
    display(HTML(data=f'''
    <div style="display: flex; flex-flow: row wrap; text-align: center;">
    {figures_html}
    </div>
'''))

    if figure_count > display_count_limit:
        print("...")
        print("")
        print(f"plus {figure_count - display_count_limit} more figures")
        print("")

    count += 1
    if count >= limit:
        remaining = len(set(any_two_cooccurrence_df["trapi_result_curie_combo"])) - limit
        # Only announce a remainder when there actually is one.
        if remaining > 0:
            print("...")
            print("")
            print(f'plus {remaining} more TRAPI results')
            print("")
        break

...

plus 182 more figures



...

plus 180 more figures



...

plus 181 more figures

...

plus 2176 more TRAPI results



#### By figure

In [26]:
from html import escape

from IPython.display import HTML, Image, display


# Render the first `limit` figures of any_two_cooccurrence_df (sorted by
# p-value; groupby(sort=False) keeps that order), each followed by up to
# `display_count_limit` of the TRAPI results that overlap with it.
limit = 3
count = 0
display_count_limit = 5

for figure_id, raw_df in any_two_cooccurrence_df.groupby("figure_id", sort=False):
    df0 = raw_df.sort_values("p_value")
    # figure_id is split into the PMC article id and the image filename;
    # maxsplit=1 tolerates a filename that itself contains "__".
    pmc, filename = figure_id.split("__", 1)
    paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
    figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
    # Escape the title so characters such as < or & cannot break the markup.
    figure_title = escape(str(df0["figure_title"].iloc[0]))

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px; font-weight: bold;">
  {figure_title}
</div>
<figure style="margin: 5px !important;">
    <a target="_blank" href="{paper_url}">
      <img src="{figure_url}" style="max-height: 500px; max-width: 100%;">
    </a>
</figure>
'''))

    trapi_results_row_sets = []
    for trapi_result_curie_combo, df1 in df0[
        trapi_result_columns + ["trapi_result_curie_combo", "overlap_curie_combo", "p_value", "score"]
    ].drop_duplicates().groupby("trapi_result_curie_combo", sort=False):
        # Names to highlight: the overlap of the first row in this group.
        overlap_names = [
            curie_to_name[curie] for curie in df1["overlap_curie_combo"].iloc[0]
        ]

        trapi_results_rows = []
        for row_index, row in df1.iterrows():
            trapi_result_chunks = []
            for trapi_result_column in trapi_result_columns:
                row_value = row[trapi_result_column]
                trapi_result_chunk = escape(f'{row_value}')
                if row_value in overlap_names:
                    trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{trapi_result_chunk}</span>
'''
                trapi_result_chunks.append(trapi_result_chunk)

            # The original appended a stray "</span>" here with no opening tag.
            trapi_results_rows.append(" - ".join(trapi_result_chunks))

        # Assumed constant within a (figure, CURIE combo) group, so take the
        # first row (deterministic, unlike picking an element out of a set).
        p_value = df1["p_value"].iloc[0]
        score = df1["score"].iloc[0]
        trapi_results_rows_html = "<br>".join(trapi_results_rows)
        trapi_results_row_sets.append(f'''
<div>
    <span style="font-style: italic; font-size: small;">
      TRAPI result p-value: {p_value:.1e}, containment: {score:.2}
    </span><br>
    {trapi_results_rows_html}
</div>
''')

    trapi_results_row_set_count = len(trapi_results_row_sets)
    display(HTML(data="<br>".join(trapi_results_row_sets[:display_count_limit])))
    if trapi_results_row_set_count > display_count_limit:
        print("...")
        print("")
        print(f"plus {trapi_results_row_set_count - display_count_limit} more TRAPI results")
    print("")

    count += 1
    if count >= limit:
        remaining = len(set(any_two_cooccurrence_df["figure_id"])) - limit
        # Only announce a remainder when there actually is one.
        if remaining > 0:
            print("...")
            print("")
            print(f'plus {remaining} more figures')
            print("")
        break

...

plus 278 more TRAPI results



...

plus 279 more TRAPI results



...

plus 278 more TRAPI results

...

plus 2293 more figures



### co-occurrence: `n0` and (`n1` or `n2`)

In [27]:
# Keep only rows whose n0 CURIE is part of the row's overlap with the figure,
# then order them by ascending p-value.
n0_in_overlap = results_with_figures_df.apply(
    lambda row: row["n0_unified_curie"] in set(row["overlap_curie_combo"]),
    axis=1,
)
n0_cooccurrence_df = results_with_figures_df[n0_in_overlap].sort_values(by="p_value")

In [28]:
# Distinct figure counts: all rows first, then rows where n0 is in the overlap.
all_figure_ids = set(results_with_figures_df["figure_id"])
n0_figure_ids = set(n0_cooccurrence_df["figure_id"])
print(len(all_figure_ids))
print(len(n0_figure_ids))

2296
200


#### By TRAPI result

In [29]:
from html import escape

from IPython.display import HTML, Image, display


# Render the first `limit` TRAPI results of n0_cooccurrence_df (which is
# sorted by p-value; groupby(sort=False) keeps that order), each followed by
# thumbnails of up to `display_count_limit` of its figures.
limit = 3
count = 0
display_count_limit = 10

for trapi_result_curie_combo, df1 in n0_cooccurrence_df.groupby("trapi_result_curie_combo", sort=False):
    # These per-combo values are assumed constant within the group, so take the
    # first row (deterministic, unlike picking an element out of a set).
    cumulative_trapi_result_curie_combo_figure_score = (
        df1["cumulative_trapi_result_curie_combo_figure_score"].iloc[0]
    )
    ordered_trapi_result_curie_combo = [
        df1[f"{q_node_id}_unified_curie"].iloc[0] for q_node_id in q_node_ids
    ]

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px;">
  <span>{escape(", ".join(ordered_trapi_result_curie_combo))}</span>
  <span>(cumulative containment score: {cumulative_trapi_result_curie_combo_figure_score:.2f})</span>
</div>
'''))

    # Names to highlight: the overlap of the first row in this group only.
    overlap_names = [
        curie_to_name[curie] for curie in df1["overlap_curie_combo"].iloc[0]
    ]

    trapi_result_rows = []
    for row_index, row in df1[trapi_result_columns].drop_duplicates().iterrows():
        trapi_result_chunks = []
        for trapi_result_column in trapi_result_columns:
            row_value = row[trapi_result_column]
            # Escape so names containing <, > or & cannot break the markup.
            trapi_result_chunk = escape(f'{row_value}')
            if row_value in overlap_names:
                trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{trapi_result_chunk}</span>
'''
            trapi_result_chunks.append(trapi_result_chunk)
        trapi_result_rows.append(" - ".join(trapi_result_chunks))
    trapi_result_rows_html = "<br>".join(trapi_result_rows)
    display(HTML(data=f'''
<div style="font-size: small;">
{trapi_result_rows_html}
</div>
'''))

    figures = []
    row_height = '100px'
    for figure_id, df0 in df1.groupby("figure_id", sort=False):
        # figure_id is split into the PMC article id and the image filename;
        # maxsplit=1 tolerates a filename that itself contains "__".
        pmc, filename = figure_id.split("__", 1)
        paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
        figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
        figure_title = df0["figure_title"].iloc[0]
        figure_title_limited = escape(
            textwrap.shorten(figure_title, width=45, placeholder="...")
        )

        p_value = df0["p_value"].iloc[0]
        score = df0["score"].iloc[0]

        # TODO: a <figure>/<figcaption> layout did not display correctly on
        # GitHub (the caption appeared to the right of the image), so the
        # caption is a plain <div> below the figure instead.
        # The outer div's style attribute used to be missing its closing quote,
        # which swallowed the <figure> opening tag.
        figures.append(f'''
        <div style="margin: 5px !important; padding: 5px !important;">
            <figure style="margin: 5px !important;">
                <a target="_blank" href="{paper_url}">
                  <img src="{figure_url}" style="height: {row_height}">
                </a>
            </figure>
            <div style="font-size: x-small;">
                {figure_title_limited}<br>
                p-value: {p_value:.1e},
                containment: {score:.2}
            </div>
        </div>
        ''')

    figure_count = len(figures)
    figures_html = ''.join(figures[:display_count_limit])
    display(HTML(data=f'''
    <div style="display: flex; flex-flow: row wrap; text-align: center;">
    {figures_html}
    </div>
'''))

    if figure_count > display_count_limit:
        print("...")
        print("")
        print(f"plus {figure_count - display_count_limit} more figures")
        print("")

    count += 1
    if count >= limit:
        remaining = len(set(n0_cooccurrence_df["trapi_result_curie_combo"])) - limit
        # Only announce a remainder when there actually is one.
        if remaining > 0:
            print("...")
            print("")
            print(f'plus {remaining} more TRAPI results')
            print("")
        break

...

plus 182 more figures



...

plus 180 more figures



...

plus 181 more figures

...

plus 2160 more TRAPI results



#### By figure

In [30]:
from html import escape

from IPython.display import HTML, Image, display


# Render the first `limit` figures of n0_cooccurrence_df (sorted by p-value;
# groupby(sort=False) keeps that order), each followed by up to
# `display_count_limit` of the TRAPI results that overlap with it.
limit = 3
count = 0
display_count_limit = 5

for figure_id, raw_df in n0_cooccurrence_df.groupby("figure_id", sort=False):
    df0 = raw_df.sort_values("p_value")
    # figure_id is split into the PMC article id and the image filename;
    # maxsplit=1 tolerates a filename that itself contains "__".
    pmc, filename = figure_id.split("__", 1)
    paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
    figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
    # Escape the title so characters such as < or & cannot break the markup.
    figure_title = escape(str(df0["figure_title"].iloc[0]))

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px; font-weight: bold;">
  {figure_title}
</div>
<figure style="margin: 5px !important;">
    <a target="_blank" href="{paper_url}">
      <img src="{figure_url}" style="max-height: 500px; max-width: 100%;">
    </a>
</figure>
'''))

    trapi_results_row_sets = []
    for trapi_result_curie_combo, df1 in df0[
        trapi_result_columns + ["trapi_result_curie_combo", "overlap_curie_combo", "p_value", "score"]
    ].drop_duplicates().groupby("trapi_result_curie_combo", sort=False):
        # Names to highlight: the overlap of the first row in this group.
        overlap_names = [
            curie_to_name[curie] for curie in df1["overlap_curie_combo"].iloc[0]
        ]

        trapi_results_rows = []
        for row_index, row in df1.iterrows():
            trapi_result_chunks = []
            for trapi_result_column in trapi_result_columns:
                row_value = row[trapi_result_column]
                trapi_result_chunk = escape(f'{row_value}')
                if row_value in overlap_names:
                    trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{trapi_result_chunk}</span>
'''
                trapi_result_chunks.append(trapi_result_chunk)

            # The original appended a stray "</span>" here with no opening tag.
            trapi_results_rows.append(" - ".join(trapi_result_chunks))

        # Assumed constant within a (figure, CURIE combo) group, so take the
        # first row (deterministic, unlike picking an element out of a set).
        p_value = df1["p_value"].iloc[0]
        score = df1["score"].iloc[0]
        trapi_results_rows_html = "<br>".join(trapi_results_rows)
        trapi_results_row_sets.append(f'''
<div>
    <span style="font-style: italic; font-size: small;">
      TRAPI result p-value: {p_value:.1e}, containment: {score:.2}
    </span><br>
    {trapi_results_rows_html}
</div>
''')

    trapi_results_row_set_count = len(trapi_results_row_sets)
    display(HTML(data="<br>".join(trapi_results_row_sets[:display_count_limit])))
    if trapi_results_row_set_count > display_count_limit:
        print("...")
        print("")
        print(f"plus {trapi_results_row_set_count - display_count_limit} more TRAPI results")
    print("")

    count += 1
    if count >= limit:
        remaining = len(set(n0_cooccurrence_df["figure_id"])) - limit
        # Only announce a remainder when there actually is one.
        if remaining > 0:
            print("...")
            print("")
            print(f'plus {remaining} more figures')
            print("")
        break

...

plus 278 more TRAPI results



...

plus 279 more TRAPI results



...

plus 278 more TRAPI results

...

plus 197 more figures



### co-occurrence: `n0` and `n1`

In [31]:
# Keep only rows where both the n0 and the n1 CURIE are part of the row's
# overlap with the figure, then order them by ascending p-value.
n0_and_n1_in_overlap = results_with_figures_df.apply(
    lambda row: (
        row["n0_unified_curie"] in set(row["overlap_curie_combo"])
        and row["n1_unified_curie"] in set(row["overlap_curie_combo"])
    ),
    axis=1,
)
n0_n1_cooccurrence_df = results_with_figures_df[n0_and_n1_in_overlap].sort_values(
    by="p_value"
)

#### By TRAPI result

In [32]:
from html import escape

from IPython.display import HTML, Image, display


# Render the first `limit` TRAPI results of n0_n1_cooccurrence_df (which is
# sorted by p-value; groupby(sort=False) keeps that order), each followed by
# thumbnails of up to `display_count_limit` of its figures.
limit = 3
count = 0
display_count_limit = 10

for trapi_result_curie_combo, df1 in n0_n1_cooccurrence_df.groupby("trapi_result_curie_combo", sort=False):
    # These per-combo values are assumed constant within the group, so take the
    # first row (deterministic, unlike picking an element out of a set).
    cumulative_trapi_result_curie_combo_figure_score = (
        df1["cumulative_trapi_result_curie_combo_figure_score"].iloc[0]
    )
    ordered_trapi_result_curie_combo = [
        df1[f"{q_node_id}_unified_curie"].iloc[0] for q_node_id in q_node_ids
    ]

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px;">
  <span>{escape(", ".join(ordered_trapi_result_curie_combo))}</span>
  <span>(cumulative containment score: {cumulative_trapi_result_curie_combo_figure_score:.2f})</span>
</div>
'''))

    # Names to highlight: the overlap of the first row in this group only.
    overlap_names = [
        curie_to_name[curie] for curie in df1["overlap_curie_combo"].iloc[0]
    ]

    trapi_result_rows = []
    for row_index, row in df1[trapi_result_columns].drop_duplicates().iterrows():
        trapi_result_chunks = []
        for trapi_result_column in trapi_result_columns:
            row_value = row[trapi_result_column]
            # Escape so names containing <, > or & cannot break the markup.
            trapi_result_chunk = escape(f'{row_value}')
            if row_value in overlap_names:
                trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{trapi_result_chunk}</span>
'''
            trapi_result_chunks.append(trapi_result_chunk)
        trapi_result_rows.append(" - ".join(trapi_result_chunks))
    trapi_result_rows_html = "<br>".join(trapi_result_rows)
    display(HTML(data=f'''
<div style="font-size: small;">
{trapi_result_rows_html}
</div>
'''))

    figures = []
    row_height = '100px'
    for figure_id, df0 in df1.groupby("figure_id", sort=False):
        # figure_id is split into the PMC article id and the image filename;
        # maxsplit=1 tolerates a filename that itself contains "__".
        pmc, filename = figure_id.split("__", 1)
        paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
        figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
        figure_title = df0["figure_title"].iloc[0]
        figure_title_limited = escape(
            textwrap.shorten(figure_title, width=45, placeholder="...")
        )

        p_value = df0["p_value"].iloc[0]
        score = df0["score"].iloc[0]

        # TODO: a <figure>/<figcaption> layout did not display correctly on
        # GitHub (the caption appeared to the right of the image), so the
        # caption is a plain <div> below the figure instead.
        # The outer div's style attribute used to be missing its closing quote,
        # which swallowed the <figure> opening tag.
        figures.append(f'''
        <div style="margin: 5px !important; padding: 5px !important;">
            <figure style="margin: 5px !important;">
                <a target="_blank" href="{paper_url}">
                  <img src="{figure_url}" style="height: {row_height}">
                </a>
            </figure>
            <div style="font-size: x-small;">
                {figure_title_limited}<br>
                p-value: {p_value:.1e},
                containment: {score:.2}
            </div>
        </div>
        ''')

    figure_count = len(figures)
    figures_html = ''.join(figures[:display_count_limit])
    display(HTML(data=f'''
    <div style="display: flex; flex-flow: row wrap; text-align: center;">
    {figures_html}
    </div>
'''))

    if figure_count > display_count_limit:
        print("...")
        print("")
        print(f"plus {figure_count - display_count_limit} more figures")
        print("")

    count += 1
    if count >= limit:
        remaining = len(set(n0_n1_cooccurrence_df["trapi_result_curie_combo"])) - limit
        # Only announce a remainder when there actually is one.
        if remaining > 0:
            print("...")
            print("")
            print(f'plus {remaining} more TRAPI results')
            print("")
        break

...

plus 181 more figures



...

plus 180 more figures



...

plus 181 more figures

...

plus 2052 more TRAPI results



#### By figure

In [33]:
from html import escape

from IPython.display import HTML, Image, display


# Render the first `limit` figures of n0_n1_cooccurrence_df (sorted by
# p-value; groupby(sort=False) keeps that order), each followed by up to
# `display_count_limit` of the TRAPI results that overlap with it.
limit = 3
count = 0
display_count_limit = 5

for figure_id, raw_df in n0_n1_cooccurrence_df.groupby("figure_id", sort=False):
    df0 = raw_df.sort_values("p_value")
    # figure_id is split into the PMC article id and the image filename;
    # maxsplit=1 tolerates a filename that itself contains "__".
    pmc, filename = figure_id.split("__", 1)
    paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
    figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
    # Escape the title so characters such as < or & cannot break the markup.
    figure_title = escape(str(df0["figure_title"].iloc[0]))

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px; font-weight: bold;">
  {figure_title}
</div>
<figure style="margin: 5px !important;">
    <a target="_blank" href="{paper_url}">
      <img src="{figure_url}" style="max-height: 500px; max-width: 100%;">
    </a>
</figure>
'''))

    trapi_results_row_sets = []
    for trapi_result_curie_combo, df1 in df0[
        trapi_result_columns + ["trapi_result_curie_combo", "overlap_curie_combo", "p_value", "score"]
    ].drop_duplicates().groupby("trapi_result_curie_combo", sort=False):
        # Names to highlight: the overlap of the first row in this group.
        overlap_names = [
            curie_to_name[curie] for curie in df1["overlap_curie_combo"].iloc[0]
        ]

        trapi_results_rows = []
        for row_index, row in df1.iterrows():
            trapi_result_chunks = []
            for trapi_result_column in trapi_result_columns:
                row_value = row[trapi_result_column]
                trapi_result_chunk = escape(f'{row_value}')
                if row_value in overlap_names:
                    trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{trapi_result_chunk}</span>
'''
                trapi_result_chunks.append(trapi_result_chunk)

            # The original appended a stray "</span>" here with no opening tag.
            trapi_results_rows.append(" - ".join(trapi_result_chunks))

        # Assumed constant within a (figure, CURIE combo) group, so take the
        # first row (deterministic, unlike picking an element out of a set).
        p_value = df1["p_value"].iloc[0]
        score = df1["score"].iloc[0]
        trapi_results_rows_html = "<br>".join(trapi_results_rows)
        trapi_results_row_sets.append(f'''
<div>
    <span style="font-style: italic; font-size: small;">
      TRAPI result p-value: {p_value:.1e}, containment: {score:.2}
    </span><br>
    {trapi_results_rows_html}
</div>
''')

    trapi_results_row_set_count = len(trapi_results_row_sets)
    display(HTML(data="<br>".join(trapi_results_row_sets[:display_count_limit])))
    if trapi_results_row_set_count > display_count_limit:
        print("...")
        print("")
        print(f"plus {trapi_results_row_set_count - display_count_limit} more TRAPI results")
    print("")

    count += 1
    if count >= limit:
        remaining = len(set(n0_n1_cooccurrence_df["figure_id"])) - limit
        # Only announce a remainder when there actually is one.
        if remaining > 0:
            print("...")
            print("")
            print(f'plus {remaining} more figures')
            print("")
        break

...

plus 275 more TRAPI results



...

plus 275 more TRAPI results



...

plus 275 more TRAPI results

...

plus 193 more figures



### co-occurrence: `n0` and `n2`

In [34]:
# Keep only rows where both the n0 and the n2 CURIE are part of the row's
# overlap with the figure, then order them by ascending p-value.
n0_and_n2_in_overlap = results_with_figures_df.apply(
    lambda row: (
        row["n0_unified_curie"] in set(row["overlap_curie_combo"])
        and row["n2_unified_curie"] in set(row["overlap_curie_combo"])
    ),
    axis=1,
)
n0_n2_cooccurrence_df = results_with_figures_df[n0_and_n2_in_overlap].sort_values(
    by="p_value"
)

#### By TRAPI result

In [35]:
from html import escape

from IPython.display import HTML, Image, display


# Render the first `limit` TRAPI results of n0_n2_cooccurrence_df (which is
# sorted by p-value; groupby(sort=False) keeps that order), each followed by
# thumbnails of up to `display_count_limit` of its figures.
limit = 3
count = 0
display_count_limit = 10

for trapi_result_curie_combo, df1 in n0_n2_cooccurrence_df.groupby("trapi_result_curie_combo", sort=False):
    # These per-combo values are assumed constant within the group, so take the
    # first row (deterministic, unlike picking an element out of a set).
    cumulative_trapi_result_curie_combo_figure_score = (
        df1["cumulative_trapi_result_curie_combo_figure_score"].iloc[0]
    )
    ordered_trapi_result_curie_combo = [
        df1[f"{q_node_id}_unified_curie"].iloc[0] for q_node_id in q_node_ids
    ]

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px;">
  <span>{escape(", ".join(ordered_trapi_result_curie_combo))}</span>
  <span>(cumulative containment score: {cumulative_trapi_result_curie_combo_figure_score:.2f})</span>
</div>
'''))

    # Names to highlight: the overlap of the first row in this group only.
    overlap_names = [
        curie_to_name[curie] for curie in df1["overlap_curie_combo"].iloc[0]
    ]

    trapi_result_rows = []
    for row_index, row in df1[trapi_result_columns].drop_duplicates().iterrows():
        trapi_result_chunks = []
        for trapi_result_column in trapi_result_columns:
            row_value = row[trapi_result_column]
            # Escape so names containing <, > or & cannot break the markup.
            trapi_result_chunk = escape(f'{row_value}')
            if row_value in overlap_names:
                trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{trapi_result_chunk}</span>
'''
            trapi_result_chunks.append(trapi_result_chunk)
        trapi_result_rows.append(" - ".join(trapi_result_chunks))
    trapi_result_rows_html = "<br>".join(trapi_result_rows)
    display(HTML(data=f'''
<div style="font-size: small;">
{trapi_result_rows_html}
</div>
'''))

    figures = []
    row_height = '100px'
    for figure_id, df0 in df1.groupby("figure_id", sort=False):
        # figure_id is split into the PMC article id and the image filename;
        # maxsplit=1 tolerates a filename that itself contains "__".
        pmc, filename = figure_id.split("__", 1)
        paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
        figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
        figure_title = df0["figure_title"].iloc[0]
        figure_title_limited = escape(
            textwrap.shorten(figure_title, width=45, placeholder="...")
        )

        p_value = df0["p_value"].iloc[0]
        score = df0["score"].iloc[0]

        # TODO: a <figure>/<figcaption> layout did not display correctly on
        # GitHub (the caption appeared to the right of the image), so the
        # caption is a plain <div> below the figure instead.
        # The outer div's style attribute used to be missing its closing quote,
        # which swallowed the <figure> opening tag.
        figures.append(f'''
        <div style="margin: 5px !important; padding: 5px !important;">
            <figure style="margin: 5px !important;">
                <a target="_blank" href="{paper_url}">
                    <img src="{figure_url}" style="height: {row_height}">
                </a>
            </figure>
            <div style="font-size: x-small;">
                {figure_title_limited}<br>
                p-value: {p_value:.1e},
                containment: {score:.2}
            </div>
        </div>
        ''')

    figure_count = len(figures)
    figures_html = ''.join(figures[:display_count_limit])
    display(HTML(data=f'''
    <div style="display: flex; flex-flow: row wrap; text-align: center;">
    {figures_html}
    </div>
'''))

    if figure_count > display_count_limit:
        print("...")
        print("")
        print(f"plus {figure_count - display_count_limit} more figures")
        print("")

    count += 1
    if count >= limit:
        remaining = len(set(n0_n2_cooccurrence_df["trapi_result_curie_combo"])) - limit
        # Only announce a remainder when there actually is one.
        if remaining > 0:
            print("...")
            print("")
            print(f'plus {remaining} more TRAPI results')
            print("")
        break

...

plus 26 more figures



...

plus 3 more figures



...

plus 265 more TRAPI results



#### By figure

In [36]:
from html import escape

from IPython.display import HTML, Image, display


# Render the first `limit` figures of n0_n2_cooccurrence_df (sorted by
# p-value; groupby(sort=False) keeps that order), each followed by up to
# `display_count_limit` of the TRAPI results that overlap with it.
limit = 3
count = 0
display_count_limit = 5

for figure_id, raw_df in n0_n2_cooccurrence_df.groupby("figure_id", sort=False):
    df0 = raw_df.sort_values("p_value")
    # figure_id is split into the PMC article id and the image filename;
    # maxsplit=1 tolerates a filename that itself contains "__".
    pmc, filename = figure_id.split("__", 1)
    paper_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/"
    figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
    # Escape the title so characters such as < or & cannot break the markup.
    figure_title = escape(str(df0["figure_title"].iloc[0]))

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px; font-weight: bold;">
  {figure_title}
</div>
<figure style="margin: 5px !important;">
    <a target="_blank" href="{paper_url}">
      <img src="{figure_url}" style="max-height: 500px; max-width: 100%;">
    </a>
</figure>
'''))

    trapi_results_row_sets = []
    for trapi_result_curie_combo, df1 in df0[
        trapi_result_columns + ["trapi_result_curie_combo", "overlap_curie_combo", "p_value", "score"]
    ].drop_duplicates().groupby("trapi_result_curie_combo", sort=False):
        # Names to highlight: the overlap of the first row in this group.
        overlap_names = [
            curie_to_name[curie] for curie in df1["overlap_curie_combo"].iloc[0]
        ]

        trapi_results_rows = []
        for row_index, row in df1.iterrows():
            trapi_result_chunks = []
            for trapi_result_column in trapi_result_columns:
                row_value = row[trapi_result_column]
                trapi_result_chunk = escape(f'{row_value}')
                if row_value in overlap_names:
                    trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{trapi_result_chunk}</span>
'''
                trapi_result_chunks.append(trapi_result_chunk)

            # The original appended a stray "</span>" here with no opening tag.
            trapi_results_rows.append(" - ".join(trapi_result_chunks))

        # Assumed constant within a (figure, CURIE combo) group, so take the
        # first row (deterministic, unlike picking an element out of a set).
        p_value = df1["p_value"].iloc[0]
        score = df1["score"].iloc[0]
        trapi_results_rows_html = "<br>".join(trapi_results_rows)
        trapi_results_row_sets.append(f'''
<div>
    <span style="font-style: italic; font-size: small;">
      TRAPI result p-value: {p_value:.1e}, containment: {score:.2}
    </span><br>
    {trapi_results_rows_html}
</div>
''')

    trapi_results_row_set_count = len(trapi_results_row_sets)
    display(HTML(data="<br>".join(trapi_results_row_sets[:display_count_limit])))
    if trapi_results_row_set_count > display_count_limit:
        print("...")
        print("")
        print(f"plus {trapi_results_row_set_count - display_count_limit} more TRAPI results")
    print("")

    count += 1
    if count >= limit:
        remaining = len(set(n0_n2_cooccurrence_df["figure_id"])) - limit
        # Only announce a remainder when there actually is one.
        if remaining > 0:
            print("...")
            print("")
            print(f'plus {remaining} more figures')
            print("")
        break








...

plus 102 more figures

