In [1]:
from copy import copy
import json
import pandas as pd

# BTE Sleeve

Some queries result in an explosion of the number of results. This is bad for performance of the server and also not very useful to the user. This notebook explores using PFOCR as a "compression sleeve" to dampen these types of explosions.

<a title="Science History Institute, CC BY-SA 3.0 &lt;https://creativecommons.org/licenses/by-sa/3.0&gt;, via Wikimedia Commons" href="https://commons.wikimedia.org/wiki/File:Lymphedema_Compression_sleeve_November_2106_Second_Skin_016.jpg"><img width="512" alt="Lymphedema Compression sleeve November 2106 Second Skin 016" src="https://upload.wikimedia.org/wikipedia/commons/thumb/6/6e/Lymphedema_Compression_sleeve_November_2106_Second_Skin_016.jpg/512px-Lymphedema_Compression_sleeve_November_2106_Second_Skin_016.jpg"></a>

Let's try test query [A.2_RHOBTB2_twohop](https://github.com/NCATSTranslator/minihackathons/blob/main/2021-12_demo/workflowA/A.2_RHOBTB2_twohop.json):

`RHOBTB2 Gene`--`entity_regulates_entity`|`genetically_interacts_with`--`Any Gene`--`related_to`--`Any SmallMolecule`

`n0`--`e01`--`n1`--`e02`--`n2`

In [2]:
import requests
import requests_cache


requests_cache.install_cache("pfocr_cache", allowable_methods=['GET', 'POST'])

## Get BTE TRAPI Results

In [3]:
query = {
    "message": {
        "query_graph": {
            "edges": {
                "e01": {
                    "object": "n0",
                    "subject": "n1",
                    "predicates": [
                        "biolink:entity_regulates_entity",
                        "biolink:genetically_interacts_with"
                    ]
                },
                "e02": {
                    "object": "n1",
                    "subject": "n2",
                    "predicates": [
                        "biolink:related_to"
                    ]
                }
            },
            "nodes": {
                "n0": {
                    "ids": [
                        "NCBIGene:23221"
                    ],
                    "categories": [
                        "biolink:Gene"
                    ]
                },
                "n1": {
                    "categories": [
                        "biolink:Gene"
                    ]
                },
                "n2": {
                    "categories": [
                        "biolink:SmallMolecule"
                    ]
                }
            }
        }
    }
}

trapi_response = requests.post("http://localhost:3000/v1/query", json=query)
#trapi_response = requests.post("https://api.bte.ncats.io/v1/query", json=query)
#trapi_response = requests.post("https://arax.ncats.io/api/rtxkg2/v1.2/query", json=query)
#trapi_response = requests.post("https://arax.ncats.io/api/arax/v1.2/query", json=query)
print(trapi_response.status_code)
if trapi_response.status_code != 200:
    print(trapi_response.text)
if trapi_response.from_cache:
    print(f"got response from cache")

trapi_message = trapi_response.json()["message"]

200
got response from cache


In [4]:
# for genes/gene products, chemicals and diseases
preferred_prefixes = set(["NCBIGene", "MESH"])
unified_names = set()
unified_curies = set()
unification_failed_curies = set()
unified_prefixes = set()
all_prefixes = set()
curie_to_name = dict()
curie_to_unified_curie = dict()
for k, v in trapi_message["knowledge_graph"]["nodes"].items():
    name = v["name"]
    for a in v["attributes"]:
        if a["attribute_type_id"] == "biolink:xref":
            curies = a["value"]
            
            # k should always be one of the curies
            if not k in curies:
                raise Exception(f"key {k} not in {curies}")
            
            unified_curie = None
            intersecting_unified_curies = unified_curies.intersection(set(curies))
            if len(intersecting_unified_curies) > 1:
                raise Exception(f"matching multiple: {k} to {list(intersecting_unified_curies)}")
            elif len(intersecting_unified_curies) == 1:
                unified_curie = list(intersecting_unified_curies)[0]
            else:
                # get curie for preferred prefix. usually this is k, but not always.
                for curie in curies:
                    [prefix, identifier] = curie.split(":")
                    if prefix in preferred_prefixes:
                        unified_curie = curie
                        unified_curies.add(unified_curie)
                        break
                        
            if not unified_curie:
                if k in curie_to_unified_curie:
                    unified_curie = curie_to_unified_curie[k]
                else:
                    unification_failed_curies.add(k)
                    break
                    
                #raise Exception(f"failed to find a unified curie for {k} in {curies}")
                
            [unified_prefix, unified_identifier] = unified_curie.split(":")
            unified_prefixes.add(unified_prefix)
            
            for curie in curies:
                [prefix, identifier] = curie.split(":")
                all_prefixes.add(prefix)
                if not curie in curie_to_unified_curie:
                    curie_to_unified_curie[curie] = unified_curie
                if not curie in curie_to_name:
                    curie_to_name[curie] = name
                    unified_names.add(name)
                elif curie_to_name[curie] != name:
                    print(f"curie {curie} has multiple primary names: {curie_to_name[curie]} and {name}")
                    #raise Exception(f"curie{curie} has multiple names: {curie_to_name[curie]} and {name}")

print(f"curie_to_name key count: {len(curie_to_name.keys())}")
print(f"curie_to_unified_curie key count: {len(curie_to_unified_curie.keys())}")
print("")
print("all curie prefixes found:")
print(all_prefixes)
print("")
print("unified curie prefixes found:")
print(unified_prefixes)
print("")
print(f"failed to unify {len(unification_failed_curies)} curies")

curie_to_name key count: 13680
curie_to_unified_curie key count: 13680

all curie prefixes found:
{'HGNC', 'GTOPDB', 'OMIM', 'CAS', 'DRUGBANK', 'INCHIKEY', 'CHEBI', 'KEGG.COMPOUND', 'CHEMBL.COMPOUND', 'MESH', 'UNII', 'PR', 'DrugCentral', 'ENSEMBL', 'UMLS', 'NCBIGene', 'PUBCHEM.COMPOUND', 'HMDB', 'UniProtKB'}

unified curie prefixes found:
{'MESH', 'NCBIGene'}

failed to unify 446 curies


In [5]:
columns = []
q_node_id_keys = set(["object", "subject"])
q_node_ids = []
q_edge_ids = []
for q_edge_id,edge_v in query["message"]["query_graph"]["edges"].items():
    q_edge_ids.append(q_edge_id)
    q_node_id_found = False
    for k,v in edge_v.items():
        if (k in q_node_id_keys) and (type(v) is str):
            if v not in columns:
                q_node_ids.append(v)
                columns.append(v)
            if not q_node_id_found:
                q_node_id_found = True
                columns.append(q_edge_id)
                
for q_node_id in q_node_ids:
    columns.append(f"{q_node_id}_curie")
    columns.append(f"{q_node_id}_unified_curie")
    
columns.append("trapi_result_curie_combo")
    
print(columns)
print(q_node_ids)
print(q_edge_ids)

['n0', 'e01', 'n1', 'e02', 'n2', 'n0_curie', 'n0_unified_curie', 'n1_curie', 'n1_unified_curie', 'n2_curie', 'n2_unified_curie', 'trapi_result_curie_combo']
['n0', 'n1', 'n2']
['e01', 'e02']


In [6]:
trapi_result_columns = []
for i in range(len(q_node_ids)):
    trapi_result_columns.append(q_node_ids[i])
    if i < len(q_edge_ids):
        trapi_result_columns.append(q_edge_ids[i])
print(trapi_result_columns)

['n0', 'e01', 'n1', 'e02', 'n2']


In [7]:
from copy import deepcopy


trapi_results = trapi_message["results"]

result_row_data = []
for trapi_result in trapi_results:
    curie_to_qnode_ids = dict()
    for qnode_id, entries in trapi_result["node_bindings"].items():
        for entry in entries:
            curie = entry["id"]
            if curie not in curie_to_qnode_ids:
                curie_to_qnode_ids[curie] = []
            curie_to_qnode_ids[curie].append(qnode_id)
        
    row_data_template = dict()
    q_edge_id_to_predicates = dict()
    trapi_result_curies = set()
    for qedge_id, entries in trapi_result["edge_bindings"].items():
        for entry in entries:
            curie = entry["id"]
            kg_entry = trapi_message["knowledge_graph"]["edges"][curie]
            subject_curie = kg_entry["subject"]
            object_curie = kg_entry["object"]
            predicate_curie = kg_entry["predicate"]
            [predicate_prefix, predicate_identifier] = predicate_curie.split(":")
            
            if qedge_id not in q_edge_id_to_predicates:
                q_edge_id_to_predicates[qedge_id] = set()
            q_edge_id_to_predicates[qedge_id].add(predicate_identifier)

            for curie in [subject_curie, object_curie]:
                for qnode_id in curie_to_qnode_ids[curie]:
                    if curie in curie_to_unified_curie:
                        unified_curie = curie_to_unified_curie[curie]
                    else:
                        break

                    name = curie_to_name[curie]
                    row_data_template[qnode_id] = name

                    trapi_result_curies.add(unified_curie)
                    row_data_template[qnode_id + "_curie"] = curie
                    row_data_template[qnode_id + "_unified_curie"] = unified_curie
                    
    if len(trapi_result_curies) != len(q_node_ids):
        #print(f'skipping {list(curie_to_qnode_ids.keys())}')
        continue
        
    row_data_template["trapi_result_curie_combo"] = tuple(sorted(trapi_result_curies))
    q_edge_ids_processed = set()
    row_datas = [row_data_template]
    for q_edge_id,predicates in q_edge_id_to_predicates.items():
        next_row_datas = []
        for row_data in row_datas:
            for predicate in predicates:
                next_row_data = deepcopy(row_data)
                next_row_data[q_edge_id] = predicate
                next_row_datas.append(
                    next_row_data 
                )
        row_datas = next_row_datas
    result_row_data += row_datas
        
print("warning: predicate direction(s) may be switched")
results_df = pd.DataFrame.from_records(result_row_data, columns=columns).drop_duplicates()
results_df



Unnamed: 0,n0,e01,n1,e02,n2,n0_curie,n0_unified_curie,n1_curie,n1_unified_curie,n2_curie,n2_unified_curie,trapi_result_curie_combo
0,RHOBTB2,entity_negatively_regulates_entity,STK11,interacts_with,4-Hydroxy-3-methoxycinnamaldehyde,NCBIGene:23221,NCBIGene:23221,NCBIGene:6794,NCBIGene:6794,PUBCHEM.COMPOUND:5280536,MESH:C075384,"(MESH:C075384, NCBIGene:23221, NCBIGene:6794)"
1,RHOBTB2,entity_negatively_regulates_entity,STK11,coexists_with,Ditiomustine,NCBIGene:23221,NCBIGene:23221,NCBIGene:6794,NCBIGene:6794,PUBCHEM.COMPOUND:127547,MESH:C033713,"(MESH:C033713, NCBIGene:23221, NCBIGene:6794)"
2,RHOBTB2,entity_negatively_regulates_entity,STK11,interacts_with,Benzene Derivatives,NCBIGene:23221,NCBIGene:23221,NCBIGene:6794,NCBIGene:6794,UMLS:C0005037,MESH:D001555,"(MESH:D001555, NCBIGene:23221, NCBIGene:6794)"
3,RHOBTB2,entity_negatively_regulates_entity,STK11,entity_negatively_regulates_entity,Puerarin,NCBIGene:23221,NCBIGene:23221,NCBIGene:6794,NCBIGene:6794,PUBCHEM.COMPOUND:5281807,MESH:C033607,"(MESH:C033607, NCBIGene:23221, NCBIGene:6794)"
4,RHOBTB2,entity_negatively_regulates_entity,STK11,entity_positively_regulates_entity,"3-Chloro-1,2-propanediol",NCBIGene:23221,NCBIGene:23221,NCBIGene:6794,NCBIGene:6794,PUBCHEM.COMPOUND:7290,MESH:D000517,"(MESH:D000517, NCBIGene:23221, NCBIGene:6794)"
...,...,...,...,...,...,...,...,...,...,...,...,...
5271,RHOBTB2,genetically_interacts_with,CUL3,interacts_with,Hydrogen peroxide,NCBIGene:23221,NCBIGene:23221,NCBIGene:8452,NCBIGene:8452,PUBCHEM.COMPOUND:784,MESH:D006861,"(MESH:D006861, NCBIGene:23221, NCBIGene:8452)"
5272,RHOBTB2,genetically_interacts_with,CUL3,entity_negatively_regulated_by_entity,MicroRNAs,NCBIGene:23221,NCBIGene:23221,NCBIGene:8452,NCBIGene:8452,UMLS:C1101610,MESH:D035683,"(MESH:D035683, NCBIGene:23221, NCBIGene:8452)"
5273,RHOBTB2,genetically_interacts_with,CUL3,entity_positively_regulated_by_entity,Eloxatin (TN),NCBIGene:23221,NCBIGene:23221,NCBIGene:8452,NCBIGene:8452,PUBCHEM.COMPOUND:9887054,MESH:C030110,"(MESH:C030110, NCBIGene:23221, NCBIGene:8452)"
5274,RHOBTB2,genetically_interacts_with,CUL3,decreases_expression_of,Folic acid,NCBIGene:23221,NCBIGene:23221,NCBIGene:8452,NCBIGene:8452,PUBCHEM.COMPOUND:135398658,MESH:D005492,"(MESH:D005492, NCBIGene:23221, NCBIGene:8452)"


In [8]:
for q_node_id in q_node_ids:
    print(len(set(results_df[q_node_id])))

1
10
1108


In [9]:
len(set(results_df["trapi_result_curie_combo"]))

3738

Total results count (excluding any that couldn't be unified): 3,738. That's too many for a researcher to efficiently go through manually, so let's try filtering to make that more manageable.

## Match up BTE TRAPI Results & PFOCR

We're going to try using PFOCR to filter and prioritize the results.

First we need to get the PFOCR Data. We could get it from the API, but for now, we'll just go ahead and download the entire JSON file we gave to BTE.

In [10]:
pfocr_url = "https://www.dropbox.com/s/1f14t5zaseocyg6/bte_chemicals_diseases_genes.ndjson?dl=1"
pfocr_request = requests.get(pfocr_url)
print(f"status_code: {pfocr_request.status_code}")
if pfocr_request.status_code != 200:
    print(pfocr_request.text)

status_code: 200


In [11]:
curies_to_figure_ids = {}
figure_id_to_curies = {}
figure_id_to_pfocr_result = {}
for line in pfocr_request.text.splitlines():
    pfocr_result = json.loads(line)
    figure_id = pfocr_result["_id"]

    curies = set()
    for identifier in pfocr_result["associatedWith"]["mentions"]["chemicals"]["mesh"]:
        curie = "MESH:" + identifier
        if curie in curie_to_unified_curie:
            unified_curie = curie_to_unified_curie[curie]
            curies.add(unified_curie)
        else:
            #print(f"{curie} not in list")
            # this curie isn't in the BTE results, but we'll add an item for
            # the purpose of the denominator in jaccard/containment cals
            curies.add(curie)
    for identifier in pfocr_result["associatedWith"]["mentions"]["diseases"]["mesh"]:
        curie = "MESH:" + identifier
        if curie in curie_to_unified_curie:
            unified_curie = curie_to_unified_curie[curie]
            curies.add(unified_curie)
        else:
            #print(f"{curie} not in list")
            # this curie isn't in the BTE results, but we'll add an item for
            # the purpose of the denominator in jaccard/containment cals
            curies.add(curie)
    for identifier in pfocr_result["associatedWith"]["mentions"]["genes"]["ncbigene"]:
        curie = "NCBIGene:" + identifier
        if curie in curie_to_unified_curie:
            unified_curie = curie_to_unified_curie[curie]
            curies.add(unified_curie)
        else:
            #print(f"{curie} not in list")
            # this curie isn't in the BTE results, but we'll add an item for
            # the purpose of the denominator in jaccard/containment cals
            curies.add(curie)

    figure_id_to_pfocr_result[figure_id] = pfocr_result
    figure_id_to_curies[figure_id] = curies

    curies_key = tuple(sorted(curies))
    if curies_key not in curies_to_figure_ids:
        curies_to_figure_ids[curies_key] = []
    curies_to_figure_ids[curies_key].append(figure_id)

In [12]:
from SetSimilaritySearch import SearchIndex

# the reference sets are supposed to be from PFOCR only
reference_sets = list()
for curies in set(curies_to_figure_ids.keys()):
    reference_sets.append(set(curies))
# but in order to calculate the scores correctly,
# the SetSimilaritySearch library requires that
# every curie from the bte results need to be
# mentioned at least once in the reference set.
# That's the only reason we add them below, as
# one large set.

unified_curie_columns = [
    q_node_id + "_unified_curie" for q_node_id in q_node_ids
]

trapi_result_curie_combos = set()
for i, df in results_df[unified_curie_columns].drop_duplicates().iterrows():
    trapi_result_curies = []
    for unified_curie_column in unified_curie_columns:
        trapi_result_curies.append(
            df[unified_curie_column]
        )
    trapi_result_curie_combos.add(tuple(sorted(
        trapi_result_curies
    )))
    
reference_set = set()
for trapi_result_curie_combo in trapi_result_curie_combos:
    reference_set |= set(trapi_result_curie_combo)
reference_sets.append(reference_set)

matchable_node_min = 2
matchable_node_count = len(q_node_ids)
index = SearchIndex(reference_sets, similarity_func_name="containment", 
    similarity_threshold=matchable_node_min/matchable_node_count)

trapi_figure_overlap_rows = []
for trapi_result_curie_combo in trapi_result_curie_combos:
    trapi_curies = set(trapi_result_curie_combo)
    results = index.query(trapi_curies)
    for result in results:
        figure_curie_combos = reference_sets[result[0]]
        figure_curie_key = tuple(sorted(figure_curie_combos))
        
        # needed to not match bte results to themselves
        if figure_curie_key in curies_to_figure_ids:
            figure_ids = curies_to_figure_ids[figure_curie_key]
        else:
            continue
            
        score = result[1]
        
        common_curies = trapi_curies.intersection(figure_curie_combos)
        for figure_id in figure_ids:
            pfocr_result = figure_id_to_pfocr_result[figure_id]
            trapi_figure_overlap_rows.append({
                "figure_id": figure_id,
                "figure_url": pfocr_result["associatedWith"]["figureUrl"],
                "figure_title": pfocr_result["associatedWith"]["title"],
                "trapi_result_curie_combo": trapi_result_curie_combo,
                "figure_curie_combo": figure_curie_key,
                "overlap_curie_combo": tuple(sorted(common_curies)),
                "score": score,
            })


trapi_figure_overlap_df = pd.DataFrame.from_records(trapi_figure_overlap_rows)
trapi_figure_overlap_df

Unnamed: 0,figure_id,figure_url,figure_title,trapi_result_curie_combo,figure_curie_combo,overlap_curie_combo,score
0,PMC3653120__fonc-03-00119-g004.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,Effect of physical activity and obesity on the...,"(MESH:D005492, NCBIGene:23221, NCBIGene:894)","(MESH:C045651, MESH:C440975, MESH:D003474, MES...","(MESH:D005492, NCBIGene:894)",0.666667
1,PMC5980113__JCMM-22-3108-g006.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,Role of MICAL1 in breast cancer cell prolifera...,"(MESH:D005472, NCBIGene:23221, NCBIGene:595)","(MESH:C085911, MESH:C086501, MESH:D005472, NCB...","(MESH:D005472, NCBIGene:595)",0.666667
2,PMC4069315__WJG-20-7878-g001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,Mechanisms of action of lactic acid bacteria v...,"(MESH:D005472, NCBIGene:23221, NCBIGene:595)","(MESH:D001930, MESH:D005472, NCBIGene:10050674...","(MESH:D005472, NCBIGene:595)",0.666667
3,PMC7642974__fphys-11-01022-g001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Action and classification of E3 ubiquitin liga...,"(MESH:C072553, NCBIGene:23221, NCBIGene:8452)","(MESH:C017937, MESH:C039244, MESH:C119604, MES...","(NCBIGene:23221, NCBIGene:8452)",0.666667
4,PMC7809845__13293_2020_343_Fig4_HTML.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Prolactin signaling pathway (KEGG ID: hsa04917),"(MESH:D004298, NCBIGene:23221, NCBIGene:894)","(MESH:D000255, MESH:D004298, MESH:D004967, MES...","(MESH:D004298, NCBIGene:894)",0.666667
...,...,...,...,...,...,...,...
2167,PMC7642974__fphys-11-01022-g001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Action and classification of E3 ubiquitin liga...,"(MESH:D005472, NCBIGene:23221, NCBIGene:8452)","(MESH:C017937, MESH:C039244, MESH:C119604, MES...","(NCBIGene:23221, NCBIGene:8452)",0.666667
2168,PMC4703318__nihms587579f1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4...,Molecular Pathways: Turning Proteasomal Protei...,"(MESH:C400082, NCBIGene:23221, NCBIGene:894)","(MESH:C000593797, MESH:C400082, MESH:C475865, ...","(MESH:C400082, NCBIGene:894)",0.666667
2169,PMC7642974__fphys-11-01022-g001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Action and classification of E3 ubiquitin liga...,"(MESH:C552719, NCBIGene:23221, NCBIGene:8452)","(MESH:C017937, MESH:C039244, MESH:C119604, MES...","(NCBIGene:23221, NCBIGene:8452)",0.666667
2170,PMC7642974__fphys-11-01022-g001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Action and classification of E3 ubiquitin liga...,"(MESH:D012964, NCBIGene:23221, NCBIGene:8452)","(MESH:C017937, MESH:C039244, MESH:C119604, MES...","(NCBIGene:23221, NCBIGene:8452)",0.666667


In [13]:
print(len(set(
    trapi_figure_overlap_df[trapi_figure_overlap_df["overlap_curie_combo"].map(len) >= 2]["trapi_result_curie_combo"]
)))
print(len(set(
    trapi_figure_overlap_df[trapi_figure_overlap_df["overlap_curie_combo"].map(len) >= 3]["trapi_result_curie_combo"]
)))
print(len(set(
    trapi_figure_overlap_df[trapi_figure_overlap_df["overlap_curie_combo"].map(len) >= 4]["trapi_result_curie_combo"]
)))
print(len(set(trapi_figure_overlap_df["figure_id"])))

887
0
0
695


### trapi_curie_combo_count_by_figure_df

In [14]:
trapi_curie_combo_count_by_figure_df = trapi_figure_overlap_df[["figure_id", "trapi_result_curie_combo"]].rename(columns={
    "trapi_result_curie_combo": "trapi_curie_combo_count"
}).groupby("figure_id").count().sort_values(
    by="trapi_curie_combo_count", ascending=False)
trapi_curie_combo_count_by_figure_df

Unnamed: 0_level_0,trapi_curie_combo_count
figure_id,Unnamed: 1_level_1
PMC6089851__gr1.jpg,326
PMC7642974__fphys-11-01022-g001.jpg,174
PMC4706528__cbm-12-04-342-f1.jpg,30
PMC5820294__fonc-08-00038-g001.jpg,21
PMC2861144__nihms195501f1.jpg,19
...,...
PMC5550862__cddis2017315f8.jpg,1
PMC5546324__nihms866088f6.jpg,1
PMC3811010__nihms-496184-f0004.jpg,1
PMC5543040__fendo-08-00187-g001.jpg,1


### figure count/score by trapi curie combo

In [15]:
trapi_figure_overlap_df[["trapi_result_curie_combo", "figure_id"]].rename(columns={
    "figure_id": "figure_count"
}).groupby("trapi_result_curie_combo").count().sort_values(
    by="figure_count", ascending=False).head(10)

Unnamed: 0_level_0,figure_count
trapi_result_curie_combo,Unnamed: 1_level_1
"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)",123
"(MESH:D000255, NCBIGene:23221, NCBIGene:6794)",107
"(MESH:D008687, NCBIGene:23221, NCBIGene:6794)",57
"(MESH:D020123, NCBIGene:23221, NCBIGene:6794)",51
"(NCBIGene:23221, NCBIGene:5111, NCBIGene:595)",42
"(MESH:D009243, NCBIGene:23221, NCBIGene:6794)",42
"(MESH:D002118, NCBIGene:23221, NCBIGene:595)",40
"(MESH:D002118, NCBIGene:23221, NCBIGene:6794)",33
"(MESH:D012694, NCBIGene:23221, NCBIGene:595)",31
"(MESH:D012694, NCBIGene:23221, NCBIGene:6794)",23


In [16]:
trapi_result_figure_score_df = trapi_figure_overlap_df[["trapi_result_curie_combo", "score"]].rename(
    columns={"score": "cumulative_trapi_result_curie_combo_figure_score"}
).groupby("trapi_result_curie_combo").sum().sort_values(
    by="cumulative_trapi_result_curie_combo_figure_score", ascending=False)
trapi_result_figure_score_df.head(10)

Unnamed: 0_level_0,cumulative_trapi_result_curie_combo_figure_score
trapi_result_curie_combo,Unnamed: 1_level_1
"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)",82.0
"(MESH:D000255, NCBIGene:23221, NCBIGene:6794)",71.333333
"(MESH:D008687, NCBIGene:23221, NCBIGene:6794)",38.0
"(MESH:D020123, NCBIGene:23221, NCBIGene:6794)",34.0
"(NCBIGene:23221, NCBIGene:5111, NCBIGene:595)",28.0
"(MESH:D009243, NCBIGene:23221, NCBIGene:6794)",28.0
"(MESH:D002118, NCBIGene:23221, NCBIGene:595)",26.666667
"(MESH:D002118, NCBIGene:23221, NCBIGene:6794)",22.0
"(MESH:D012694, NCBIGene:23221, NCBIGene:595)",20.666667
"(MESH:D012694, NCBIGene:23221, NCBIGene:6794)",15.333333


In [17]:
results_with_figures_df = trapi_figure_overlap_df.merge(
    results_df,
    on="trapi_result_curie_combo",
    how="left").merge(
    trapi_result_figure_score_df.reset_index(),
    on="trapi_result_curie_combo",
    how="left",
    validate="many_to_one"
).merge(
    trapi_curie_combo_count_by_figure_df.reset_index().rename(columns={
        "trapi_curie_combo_count": "trapi_curie_combo_count_by_figure"
    }),
    on="figure_id",
    how="left",
    validate="many_to_one"
).sort_values(
    by="cumulative_trapi_result_curie_combo_figure_score", ascending=False
)

results_with_figures_df

Unnamed: 0,figure_id,figure_url,figure_title,trapi_result_curie_combo,figure_curie_combo,overlap_curie_combo,score,n0,e01,n1,e02,n2,n0_curie,n0_unified_curie,n1_curie,n1_unified_curie,n2_curie,n2_unified_curie,cumulative_trapi_result_curie_combo_figure_score,trapi_curie_combo_count_by_figure
2174,PMC3536343__cshperspect-TRA-012252_F1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,MTORC1 signaling to the translational machinery,"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)","(MESH:C039301, MESH:D000249, MESH:D000255, MES...","(MESH:D000249, NCBIGene:6794)",0.666667,RHOBTB2,entity_negatively_regulates_entity,STK11,entity_positively_regulates_entity,Adenosine monophosphate,NCBIGene:23221,NCBIGene:23221,NCBIGene:6794,NCBIGene:6794,PUBCHEM.COMPOUND:6083,MESH:D000249,82.000000,2
1971,PMC6304530__BMRI2018-5837235.001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,The reprogramming of metabolism in cancer,"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)","(MESH:C000589078, MESH:C011080, MESH:C038361, ...","(MESH:D000249, NCBIGene:6794)",0.666667,RHOBTB2,entity_negatively_regulates_entity,STK11,entity_negatively_regulates_entity,Adenosine monophosphate,NCBIGene:23221,NCBIGene:23221,NCBIGene:6794,NCBIGene:6794,PUBCHEM.COMPOUND:6083,MESH:D000249,82.000000,6
1962,PMC7998595__jcm-10-01184-g001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Autophagy pathway in podocyte,"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)","(MESH:C530429, MESH:D000186, MESH:D000249, MES...","(MESH:D000249, NCBIGene:6794)",0.666667,RHOBTB2,entity_negatively_regulates_entity,STK11,entity_negatively_regulates_entity,Adenosine monophosphate,NCBIGene:23221,NCBIGene:23221,NCBIGene:6794,NCBIGene:6794,PUBCHEM.COMPOUND:6083,MESH:D000249,82.000000,2
1963,PMC7998595__jcm-10-01184-g001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Autophagy pathway in podocyte,"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)","(MESH:C530429, MESH:D000186, MESH:D000249, MES...","(MESH:D000249, NCBIGene:6794)",0.666667,RHOBTB2,entity_negatively_regulates_entity,STK11,correlated_with,Adenosine monophosphate,NCBIGene:23221,NCBIGene:23221,NCBIGene:6794,NCBIGene:6794,PUBCHEM.COMPOUND:6083,MESH:D000249,82.000000,2
1964,PMC7998595__jcm-10-01184-g001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Autophagy pathway in podocyte,"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)","(MESH:C530429, MESH:D000186, MESH:D000249, MES...","(MESH:D000249, NCBIGene:6794)",0.666667,RHOBTB2,entity_negatively_regulates_entity,STK11,entity_positively_regulates_entity,Adenosine monophosphate,NCBIGene:23221,NCBIGene:23221,NCBIGene:6794,NCBIGene:6794,PUBCHEM.COMPOUND:6083,MESH:D000249,82.000000,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3589,PMC6089851__gr1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Major RAS family numbers,"(MESH:C531198, NCBIGene:23221, NCBIGene:388)","(MESH:C040523, MESH:C052497, MESH:C535729, MES...","(NCBIGene:23221, NCBIGene:388)",0.666667,RHOBTB2,entity_negatively_regulates_entity,RHOB,resistance_associated_with,Dactolisib,NCBIGene:23221,NCBIGene:23221,NCBIGene:388,NCBIGene:388,PUBCHEM.COMPOUND:11977753,MESH:C531198,0.666667,326
3590,PMC6089851__gr1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Major RAS family numbers,"(MESH:C531198, NCBIGene:23221, NCBIGene:388)","(MESH:C040523, MESH:C052497, MESH:C535729, MES...","(NCBIGene:23221, NCBIGene:388)",0.666667,RHOBTB2,entity_negatively_regulates_entity,RHOB,sensitivity_associated_with,Dactolisib,NCBIGene:23221,NCBIGene:23221,NCBIGene:388,NCBIGene:388,PUBCHEM.COMPOUND:11977753,MESH:C531198,0.666667,326
3591,PMC3618522__nihms433145f1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,Molecular Pathways: Current Role and Future Di...,"(MESH:D014801, NCBIGene:23221, NCBIGene:595)","(MESH:C059630, MESH:D014212, MESH:D014801, NCB...","(MESH:D014801, NCBIGene:595)",0.666667,RHOBTB2,entity_negatively_regulates_entity,CCND1,entity_negatively_regulates_entity,Retinol,NCBIGene:23221,NCBIGene:23221,NCBIGene:595,NCBIGene:595,PUBCHEM.COMPOUND:445354,MESH:D014801,0.666667,2
1769,PMC6089851__gr1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Major RAS family numbers,"(MESH:C000710356, NCBIGene:23221, NCBIGene:388)","(MESH:C040523, MESH:C052497, MESH:C535729, MES...","(NCBIGene:23221, NCBIGene:388)",0.666667,RHOBTB2,entity_negatively_regulates_entity,RHOB,resistance_associated_with,1-[4-Amino-7-(3-hydroxypropyl)-5-(4-methylphen...,NCBIGene:23221,NCBIGene:23221,NCBIGene:388,NCBIGene:388,PUBCHEM.COMPOUND:644243,MESH:C000710356,0.666667,326


## Analysis of results with vs. without Sleeve

With the current query graph, requiring co-occurrence of at least 2 nodes in at least one PFOCR figure cuts the number of TRAPI result curie combos down to 887 from 3,738.

When requiring 3 nodes, it went down even further to 0.

Total number of curie combos returned:

In [18]:
len(results_df["trapi_result_curie_combo"].drop_duplicates())

3738

TRAPI result curie combos where at least `x` curies are also found in at least one figure:

In [19]:
for i in range(2, len(q_node_ids) + 1):
    curie_combo_count = len(set(
        trapi_figure_overlap_df[
            trapi_figure_overlap_df["overlap_curie_combo"].map(len) >= i
        ]["trapi_result_curie_combo"]
    ))
    print(f'when {i}+ overlapping curies: {curie_combo_count}')

when 2+ overlapping curies: 887
when 3+ overlapping curies: 0


curie combos where curies for each pair of nodes are found in at least one figure:

In [20]:
from itertools import combinations


overlap_columns = results_with_figures_df["overlap_curie_combo"].apply(pd.Series)

for (a, b) in combinations(q_node_ids, 2):
    overlap_df = results_with_figures_df[((
        results_with_figures_df[f"{a}_unified_curie"] == overlap_columns[0]
    ) | (
        results_with_figures_df[f"{a}_unified_curie"] == overlap_columns[1]
    )) & ((
        results_with_figures_df[f"{b}_unified_curie"] == overlap_columns[0]
    ) | (
        results_with_figures_df[f"{b}_unified_curie"] == overlap_columns[1]
    ))]
    print(f'{a} & {b}: {len(overlap_df["trapi_result_curie_combo"].drop_duplicates())}')

n0 & n1: 491
n0 & n2: 9
n1 & n2: 417


## Fisher's Exact Test

In [22]:
import scipy.stats as stats


all_curies = (
    set(
        results_with_figures_df["n0_unified_curie"]
    ) | set(
        results_with_figures_df["n1_unified_curie"]
    ) | set(
        results_with_figures_df["n2_unified_curie"]
    )
)
for figure_curie_combo in set(results_with_figures_df["figure_curie_combo"]):
    figure_curie_set = set(figure_curie_combo)
    all_curies |= figure_curie_set
print(len(all_curies))

5083


In [23]:
results_with_figures_df["yes_pwy_yes_trapi"] = (
    results_with_figures_df["overlap_curie_combo"].map(len)
)
results_with_figures_df["yes_pwy_no_trapi"] = (
    results_with_figures_df["figure_curie_combo"].map(len) - 
    results_with_figures_df["overlap_curie_combo"].map(len)
)
results_with_figures_df["no_pwy_yes_trapi"] = (
    len(q_node_ids) - results_with_figures_df["overlap_curie_combo"].map(len)
)
results_with_figures_df["no_pwy_no_trapi"] = (
    len(all_curies) -
    results_with_figures_df["yes_pwy_yes_trapi"] -
    results_with_figures_df["yes_pwy_no_trapi"] -
    results_with_figures_df["no_pwy_yes_trapi"]
)

2174    2
1971    2
1962    2
1963    2
1964    2
       ..
3589    2
3590    2
3591    2
1769    2
0       2
Name: yes_pwy_yes_trapi, Length: 4348, dtype: int64

In [27]:
results_with_figures_df["fishers_exact"] = results_with_figures_df[
    ["yes_pwy_yes_trapi",
     "yes_pwy_no_trapi",
     "no_pwy_yes_trapi",
     "no_pwy_no_trapi"]
].apply(
    lambda r: stats.fisher_exact([[
        r.yes_pwy_yes_trapi,
        r.yes_pwy_no_trapi
    ], [
        r.no_pwy_yes_trapi,
        r.no_pwy_no_trapi,
    ]]),
    axis=1)
results_with_figures_df["fishers_exact_p"] = results_with_figures_df["fishers_exact"].apply(
    lambda x: x[1]
)

In [28]:
results_with_figures_df["fishers_exact_p"].sort_values()

3296    6.967240e-07
3297    3.482248e-06
4252    3.482248e-06
3598    3.482248e-06
3599    3.482248e-06
            ...     
1961    1.469492e-02
2916    1.469492e-02
4088    1.469492e-02
1960    1.469492e-02
355     1.469492e-02
Name: fishers_exact_p, Length: 4348, dtype: float64

In [42]:
results_with_figures_df

Unnamed: 0,figure_id,figure_url,figure_title,trapi_result_curie_combo,figure_curie_combo,overlap_curie_combo,score,n0,e01,n1,...,n2_curie,n2_unified_curie,cumulative_trapi_result_curie_combo_figure_score,trapi_curie_combo_count_by_figure,yes_pwy_yes_trapi,yes_pwy_no_trapi,no_pwy_yes_trapi,no_pwy_no_trapi,fishers_exact,fishers_exact_p
2174,PMC3536343__cshperspect-TRA-012252_F1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,MTORC1 signaling to the translational machinery,"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)","(MESH:C039301, MESH:D000249, MESH:D000255, MES...","(MESH:D000249, NCBIGene:6794)",0.666667,RHOBTB2,entity_negatively_regulates_entity,STK11,...,PUBCHEM.COMPOUND:6083,MESH:D000249,82.000000,2,2,51,1,5029,"(197.2156862745098, 0.00031792875353329445)",0.000318
1971,PMC6304530__BMRI2018-5837235.001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,The reprogramming of metabolism in cancer,"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)","(MESH:C000589078, MESH:C011080, MESH:C038361, ...","(MESH:D000249, NCBIGene:6794)",0.666667,RHOBTB2,entity_negatively_regulates_entity,STK11,...,PUBCHEM.COMPOUND:6083,MESH:D000249,82.000000,6,2,88,1,4992,"(113.45454545454545, 0.0009195076335945751)",0.000920
1962,PMC7998595__jcm-10-01184-g001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Autophagy pathway in podocyte,"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)","(MESH:C530429, MESH:D000186, MESH:D000249, MES...","(MESH:D000249, NCBIGene:6794)",0.666667,RHOBTB2,entity_negatively_regulates_entity,STK11,...,PUBCHEM.COMPOUND:6083,MESH:D000249,82.000000,2,2,75,1,5005,"(133.46666666666667, 0.0006729393497509607)",0.000673
1963,PMC7998595__jcm-10-01184-g001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Autophagy pathway in podocyte,"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)","(MESH:C530429, MESH:D000186, MESH:D000249, MES...","(MESH:D000249, NCBIGene:6794)",0.666667,RHOBTB2,entity_negatively_regulates_entity,STK11,...,PUBCHEM.COMPOUND:6083,MESH:D000249,82.000000,2,2,75,1,5005,"(133.46666666666667, 0.0006729393497509607)",0.000673
1964,PMC7998595__jcm-10-01184-g001.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,Autophagy pathway in podocyte,"(MESH:D000249, NCBIGene:23221, NCBIGene:6794)","(MESH:C530429, MESH:D000186, MESH:D000249, MES...","(MESH:D000249, NCBIGene:6794)",0.666667,RHOBTB2,entity_negatively_regulates_entity,STK11,...,PUBCHEM.COMPOUND:6083,MESH:D000249,82.000000,2,2,75,1,5005,"(133.46666666666667, 0.0006729393497509607)",0.000673
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3589,PMC6089851__gr1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Major RAS family numbers,"(MESH:C531198, NCBIGene:23221, NCBIGene:388)","(MESH:C040523, MESH:C052497, MESH:C535729, MES...","(NCBIGene:23221, NCBIGene:388)",0.666667,RHOBTB2,entity_negatively_regulates_entity,RHOB,...,PUBCHEM.COMPOUND:11977753,MESH:C531198,0.666667,326,2,153,1,4927,"(64.40522875816994, 0.0027165132937553233)",0.002717
3590,PMC6089851__gr1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Major RAS family numbers,"(MESH:C531198, NCBIGene:23221, NCBIGene:388)","(MESH:C040523, MESH:C052497, MESH:C535729, MES...","(NCBIGene:23221, NCBIGene:388)",0.666667,RHOBTB2,entity_negatively_regulates_entity,RHOB,...,PUBCHEM.COMPOUND:11977753,MESH:C531198,0.666667,326,2,153,1,4927,"(64.40522875816994, 0.0027165132937553233)",0.002717
3591,PMC3618522__nihms433145f1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,Molecular Pathways: Current Role and Future Di...,"(MESH:D014801, NCBIGene:23221, NCBIGene:595)","(MESH:C059630, MESH:D014212, MESH:D014801, NCB...","(MESH:D014801, NCBIGene:595)",0.666667,RHOBTB2,entity_negatively_regulates_entity,CCND1,...,PUBCHEM.COMPOUND:445354,MESH:D014801,0.666667,2,2,15,1,5065,"(675.3333333333334, 3.152679363284944e-05)",0.000032
1769,PMC6089851__gr1.jpg,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6...,Major RAS family numbers,"(MESH:C000710356, NCBIGene:23221, NCBIGene:388)","(MESH:C040523, MESH:C052497, MESH:C535729, MES...","(NCBIGene:23221, NCBIGene:388)",0.666667,RHOBTB2,entity_negatively_regulates_entity,RHOB,...,PUBCHEM.COMPOUND:644243,MESH:C000710356,0.666667,326,2,153,1,4927,"(64.40522875816994, 0.0027165132937553233)",0.002717


## View figures and TRAPI result curie combos

### By lowest p-value

In [30]:
subset_df = results_with_figures_df.sort_values(
    "fishers_exact_p"
)

#### By TRAPI result curie combo

In [31]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 5
count = 0
display_count_limit = 20

for trapi_result_curie_combo,df1 in subset_df.groupby("trapi_result_curie_combo", sort=False):
    trapi_curie_names = []
    for curie in df1["trapi_result_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        trapi_curie_names.append(name)

    cumulative_trapi_result_curie_combo_figure_score = list(set(
        df1["cumulative_trapi_result_curie_combo_figure_score"]
    ))[0]

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px;">
  <span style="font-weight: bold;">cumulative score: {cumulative_trapi_result_curie_combo_figure_score:.2f}</span>
  {", ".join(trapi_curie_names)}
  <span style="font-size: x-small;">({", ".join(trapi_result_curie_combo)})</span><br>
</div>
'''))

    overlap_names = []
    for curie in df1["overlap_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        overlap_names.append(name)

    trapi_results_rows = []
    for i,row in df1[trapi_result_columns].drop_duplicates().iterrows():
        trapi_result_chunks = []
        for trapi_result_column in trapi_result_columns:
            row_value = row[trapi_result_column]
            if row_value in overlap_names:
                trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
            else:
                trapi_result_chunk = f'{row_value}'
            trapi_result_chunks.append(trapi_result_chunk)
        trapi_results_rows.append(
            f'{" - ".join(trapi_result_chunks)}'
        )
    trapi_results_row_count = len(trapi_results_rows)
    display(HTML(data=f'''
<div style="font-size: x-small;">
{"<br>".join(trapi_results_rows[:display_count_limit])}
</div>
'''))
    if trapi_results_row_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {trapi_results_row_count - display_count_limit} more")
    print("")

    figures = []
    row_height='100px'
    for figure_id,df0 in df1.groupby("figure_id"):
        [pmc,filename] = figure_id.split("__")
        figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
        figure_title = list(set(df0["figure_title"]))[0]

        score = list(set(df0["score"]))[0]

        figures.append(f'''
            <figure style="margin: 5px !important;">
              <img src="{figure_url}" style="height: {row_height}">
            </figure>
        ''')
    figure_count = len(figures)
    display(HTML(data=f'''
    <div style="display: flex; flex-flow: row wrap; text-align: center;">
    {''.join(figures[:display_count_limit])}
    </div>
'''))

    if figure_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {figure_count - display_count_limit} more")


    print("")
    count += 1
    if count >= limit:
        print("...")
        remaining = len(set(results_with_figures_df["trapi_result_curie_combo"])) - limit
        print(f'{remaining} more TRAPI result curie combos')
        break





























...
882 more TRAPI result curie combos


#### By figure

In [32]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 3
count = 0
display_count_limit = 10

for figure_id,df0 in subset_df.groupby("figure_id", sort=False):
    print(figure_id)
    [pmc,filename] = figure_id.split("__")
    figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
    figure_title = list(set(df0["figure_title"]))[0]

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px;">
  {figure_title}<br>
  <span style="font-size: x-small;">{figure_id}</span>
</div>
'''))
    display(Image(url=figure_url, width=600))

    trapi_results_row_sets = []
    for trapi_result_curie_combo,df1 in df0[
        trapi_result_columns + ["trapi_result_curie_combo", "overlap_curie_combo"]
    ].drop_duplicates().groupby("trapi_result_curie_combo", sort=False):
        overlap_names = []
        for curie in df1["overlap_curie_combo"].tolist()[0]:
            name = curie_to_name[curie]
            overlap_names.append(name)

        trapi_results_rows = []
        for i,row in df1.iterrows():
            trapi_result_chunks = []
            for trapi_result_column in trapi_result_columns:
                row_value = row[trapi_result_column]
                if row_value in overlap_names:
                    trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
                else:
                    trapi_result_chunk = f'{row_value}'
                trapi_result_chunks.append(trapi_result_chunk)
            trapi_results_rows.append(
                f'{" - ".join(trapi_result_chunks)}'
            )
        trapi_results_row_sets.append(f'''
<div style="font-size: x-small;">
{"<br>".join(trapi_results_rows)}
</div>
''')

    trapi_results_row_set_count = len(trapi_results_row_sets)
    display(HTML(data="<br>".join(trapi_results_row_sets[:display_count_limit])))
    if trapi_results_row_set_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {trapi_results_row_set_count - display_count_limit} more")
    print("")

    count += 1
    if count >= limit:
        print("...")
        print(f'{len(set(results_with_figures_df["figure_id"])) - limit} more figures')
        break

PMC6536957__CAM4-8-2252-g001.jpg



PMC4450261__BMRI2015-597134.001.jpg



PMC4911202__BMB-48-609-g002.jpg



...
692 more figures


### `n0` pinned

In [33]:
subset_df = results_with_figures_df[results_with_figures_df.apply(
    lambda r: r.n0_unified_curie in set(r.overlap_curie_combo),
    axis=1
)].sort_values(
    "fishers_exact_p"
)

#### By TRAPI result curie combo

In [34]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 5
count = 0
display_count_limit = 20

for trapi_result_curie_combo,df1 in subset_df.groupby("trapi_result_curie_combo", sort=False):
    trapi_curie_names = []
    for curie in df1["trapi_result_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        trapi_curie_names.append(name)

    cumulative_trapi_result_curie_combo_figure_score = list(set(
        df1["cumulative_trapi_result_curie_combo_figure_score"]
    ))[0]

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px;">
  <span style="font-weight: bold;">cumulative score: {cumulative_trapi_result_curie_combo_figure_score:.2f}</span>
  {", ".join(trapi_curie_names)}
  <span style="font-size: x-small;">({", ".join(trapi_result_curie_combo)})</span><br>
</div>
'''))

    overlap_names = []
    for curie in df1["overlap_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        overlap_names.append(name)

    trapi_results_rows = []
    for i,row in df1[trapi_result_columns].drop_duplicates().iterrows():
        trapi_result_chunks = []
        for trapi_result_column in trapi_result_columns:
            row_value = row[trapi_result_column]
            if row_value in overlap_names:
                trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
            else:
                trapi_result_chunk = f'{row_value}'
            trapi_result_chunks.append(trapi_result_chunk)
        trapi_results_rows.append(
            f'{" - ".join(trapi_result_chunks)}'
        )
    trapi_results_row_count = len(trapi_results_rows)
    display(HTML(data=f'''
<div style="font-size: x-small;">
{"<br>".join(trapi_results_rows[:display_count_limit])}
</div>
'''))
    if trapi_results_row_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {trapi_results_row_count - display_count_limit} more")
    print("")

    figures = []
    row_height='100px'
    for figure_id,df0 in df1.groupby("figure_id"):
        [pmc,filename] = figure_id.split("__")
        figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
        figure_title = list(set(df0["figure_title"]))[0]

        score = list(set(df0["score"]))[0]

        figures.append(f'''
            <figure style="margin: 5px !important;">
              <img src="{figure_url}" style="height: {row_height}">
            </figure>
        ''')
    figure_count = len(figures)
    display(HTML(data=f'''
    <div style="display: flex; flex-flow: row wrap; text-align: center;">
    {''.join(figures[:display_count_limit])}
    </div>
'''))

    if figure_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {figure_count - display_count_limit} more")


    print("")
    count += 1
    if count >= limit:
        print("...")
        print(f'{len(set(subset_df["trapi_result_curie_combo"])) - limit} more TRAPI result curie combos')
        break





























...
495 more TRAPI result curie combos


#### By figure

In [35]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 3
count = 0
display_count_limit = 10

for figure_id,df0 in results_with_figures_df[results_with_figures_df.apply(
    lambda r: r.n0_unified_curie in set(r.overlap_curie_combo),
    axis=1
)].groupby("figure_id", sort=False):
    [pmc,filename] = figure_id.split("__")
    figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
    figure_title = list(set(df0["figure_title"]))[0]

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px;">
  {figure_title}<br>
  <span style="font-size: x-small;">{figure_id}</span>
</div>
'''))
    display(Image(url=figure_url, width=600))

    trapi_results_row_sets = []
    for trapi_result_curie_combo,df1 in df0[
        trapi_result_columns + ["trapi_result_curie_combo", "overlap_curie_combo"]
    ].drop_duplicates().groupby("trapi_result_curie_combo", sort=False):
        overlap_names = []
        for curie in df1["overlap_curie_combo"].tolist()[0]:
            name = curie_to_name[curie]
            overlap_names.append(name)

        trapi_results_rows = []
        for i,row in df1.iterrows():
            trapi_result_chunks = []
            for trapi_result_column in trapi_result_columns:
                row_value = row[trapi_result_column]
                if row_value in overlap_names:
                    trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
                else:
                    trapi_result_chunk = f'{row_value}'
                trapi_result_chunks.append(trapi_result_chunk)
            trapi_results_rows.append(
                f'{" - ".join(trapi_result_chunks)}'
            )
        trapi_results_row_sets.append(f'''
<div style="font-size: x-small;">
{"<br>".join(trapi_results_rows)}
</div>
''')

    trapi_results_row_set_count = len(trapi_results_row_sets)
    display(HTML(data="<br>".join(trapi_results_row_sets[:display_count_limit])))
    if trapi_results_row_set_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {trapi_results_row_set_count - display_count_limit} more")
    print("")

    count += 1
    if count >= limit:
        print("...")
        print(f'{len(set(results_with_figures_df["figure_id"])) - limit} more figures')
        break

...

plus 164 more



...

plus 316 more



### `n0` and `n1` pinned

In [36]:
subset_df = results_with_figures_df[results_with_figures_df.apply(
    lambda r: (
        r.n0_unified_curie in set(r.overlap_curie_combo)
    ) and (
        r.n1_unified_curie in set(r.overlap_curie_combo)
    ),
    axis=1
)].sort_values(
    "fishers_exact_p"
)

#### By TRAPI result curie combo

In [37]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 5
count = 0
display_count_limit = 20

for trapi_result_curie_combo,df1 in subset_df.groupby("trapi_result_curie_combo", sort=False):
    trapi_curie_names = []
    for curie in df1["trapi_result_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        trapi_curie_names.append(name)

    cumulative_trapi_result_curie_combo_figure_score = list(set(
        df1["cumulative_trapi_result_curie_combo_figure_score"]
    ))[0]

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px;">
  <span style="font-weight: bold;">cumulative score: {cumulative_trapi_result_curie_combo_figure_score:.2f}</span>
  {", ".join(trapi_curie_names)}
  <span style="font-size: x-small;">({", ".join(trapi_result_curie_combo)})</span><br>
</div>
'''))

    overlap_names = []
    for curie in df1["overlap_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        overlap_names.append(name)

    trapi_results_rows = []
    for i,row in df1[trapi_result_columns].drop_duplicates().iterrows():
        trapi_result_chunks = []
        for trapi_result_column in trapi_result_columns:
            row_value = row[trapi_result_column]
            if row_value in overlap_names:
                trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
            else:
                trapi_result_chunk = f'{row_value}'
            trapi_result_chunks.append(trapi_result_chunk)
        trapi_results_rows.append(
            f'{" - ".join(trapi_result_chunks)}'
        )
    trapi_results_row_count = len(trapi_results_rows)
    display(HTML(data=f'''
<div style="font-size: x-small;">
{"<br>".join(trapi_results_rows[:display_count_limit])}
</div>
'''))
    if trapi_results_row_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {trapi_results_row_count - display_count_limit} more")
    print("")

    figures = []
    row_height='100px'
    for figure_id,df0 in df1.groupby("figure_id"):
        [pmc,filename] = figure_id.split("__")
        figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
        figure_title = list(set(df0["figure_title"]))[0]

        score = list(set(df0["score"]))[0]

        figures.append(f'''
            <figure style="margin: 5px !important;">
              <img src="{figure_url}" style="height: {row_height}">
            </figure>
        ''')
    figure_count = len(figures)
    display(HTML(data=f'''
    <div style="display: flex; flex-flow: row wrap; text-align: center;">
    {''.join(figures[:display_count_limit])}
    </div>
'''))

    if figure_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {figure_count - display_count_limit} more")


    print("")
    count += 1
    if count >= limit:
        print("...")
        print(f'{len(set(subset_df["trapi_result_curie_combo"])) - limit} more TRAPI result curie combos')
        break





























...
486 more TRAPI result curie combos


#### By figure

In [38]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 3
count = 0
display_count_limit = 10

for figure_id,df0 in subset_df.groupby("figure_id", sort=False):
    [pmc,filename] = figure_id.split("__")
    figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
    figure_title = list(set(df0["figure_title"]))[0]

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px;">
  {figure_title}<br>
  <span style="font-size: x-small;">{figure_id}</span>
</div>
'''))
    display(Image(url=figure_url, width=600))

    trapi_results_row_sets = []
    for trapi_result_curie_combo,df1 in df0[
        trapi_result_columns + ["trapi_result_curie_combo", "overlap_curie_combo"]
    ].drop_duplicates().groupby("trapi_result_curie_combo", sort=False):
        overlap_names = []
        for curie in df1["overlap_curie_combo"].tolist()[0]:
            name = curie_to_name[curie]
            overlap_names.append(name)

        trapi_results_rows = []
        for i,row in df1.iterrows():
            trapi_result_chunks = []
            for trapi_result_column in trapi_result_columns:
                row_value = row[trapi_result_column]
                if row_value in overlap_names:
                    trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
                else:
                    trapi_result_chunk = f'{row_value}'
                trapi_result_chunks.append(trapi_result_chunk)
            trapi_results_rows.append(
                f'{" - ".join(trapi_result_chunks)}'
            )
        trapi_results_row_sets.append(f'''
<div style="font-size: x-small;">
{"<br>".join(trapi_results_rows)}
</div>
''')

    trapi_results_row_set_count = len(trapi_results_row_sets)
    display(HTML(data="<br>".join(trapi_results_row_sets[:display_count_limit])))
    if trapi_results_row_set_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {trapi_results_row_set_count - display_count_limit} more")
    print("")

    count += 1
    if count >= limit:
        print("...")
        print(f'{len(set(subset_df["figure_id"])) - limit} more figures')
        break

...

plus 155 more



...

plus 316 more



### `n0` and `n2` pinned

In [39]:
subset_df = results_with_figures_df[results_with_figures_df.apply(
    lambda r: (
        r.n0_unified_curie in set(r.overlap_curie_combo)
    ) and (
        r.n2_unified_curie in set(r.overlap_curie_combo)
    ),
    axis=1
)].sort_values(
    "fishers_exact_p"
)

#### By TRAPI result curie combo

In [40]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 5
count = 0
display_count_limit = 20

for trapi_result_curie_combo,df1 in subset_df.groupby("trapi_result_curie_combo", sort=False):
    trapi_curie_names = []
    for curie in df1["trapi_result_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        trapi_curie_names.append(name)

    cumulative_trapi_result_curie_combo_figure_score = list(set(
        df1["cumulative_trapi_result_curie_combo_figure_score"]
    ))[0]

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px;">
  <span style="font-weight: bold;">cumulative score: {cumulative_trapi_result_curie_combo_figure_score:.2f}</span>
  {", ".join(trapi_curie_names)}
  <span style="font-size: x-small;">({", ".join(trapi_result_curie_combo)})</span><br>
</div>
'''))

    overlap_names = []
    for curie in df1["overlap_curie_combo"].tolist()[0]:
        name = curie_to_name[curie]
        overlap_names.append(name)

    trapi_results_rows = []
    for i,row in df1[trapi_result_columns].drop_duplicates().iterrows():
        trapi_result_chunks = []
        for trapi_result_column in trapi_result_columns:
            row_value = row[trapi_result_column]
            if row_value in overlap_names:
                trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
            else:
                trapi_result_chunk = f'{row_value}'
            trapi_result_chunks.append(trapi_result_chunk)
        trapi_results_rows.append(
            f'{" - ".join(trapi_result_chunks)}'
        )
    trapi_results_row_count = len(trapi_results_rows)
    display(HTML(data=f'''
<div style="font-size: x-small;">
{"<br>".join(trapi_results_rows[:display_count_limit])}
</div>
'''))
    if trapi_results_row_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {trapi_results_row_count - display_count_limit} more")
    print("")

    figures = []
    row_height='100px'
    for figure_id,df0 in df1.groupby("figure_id"):
        [pmc,filename] = figure_id.split("__")
        figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
        figure_title = list(set(df0["figure_title"]))[0]

        score = list(set(df0["score"]))[0]

        figures.append(f'''
            <figure style="margin: 5px !important;">
              <img src="{figure_url}" style="height: {row_height}">
            </figure>
        ''')
    figure_count = len(figures)
    display(HTML(data=f'''
    <div style="display: flex; flex-flow: row wrap; text-align: center;">
    {''.join(figures[:display_count_limit])}
    </div>
'''))

    if figure_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {figure_count - display_count_limit} more")


    print("")
    count += 1
    if count >= limit:
        print("...")
        print(f'{len(set(subset_df["trapi_result_curie_combo"])) - limit} more TRAPI result curie combos')
        break





























...
4 more TRAPI result curie combos


#### By figure

In [41]:
from IPython.display import Image
from IPython.core.display import HTML 


limit = 3
count = 0
display_count_limit = 10

for figure_id,df0 in subset_df.groupby("figure_id", sort=False):
    [pmc,filename] = figure_id.split("__")
    figure_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmc}/bin/{filename}"
    figure_title = list(set(df0["figure_title"]))[0]

    display(HTML(data=f'''
<div style="border: 1px solid slategray; padding: 6px;">
  {figure_title}<br>
  <span style="font-size: x-small;">{figure_id}</span>
</div>
'''))
    display(Image(url=figure_url, width=600))

    trapi_results_row_sets = []
    for trapi_result_curie_combo,df1 in df0[
        trapi_result_columns + ["trapi_result_curie_combo", "overlap_curie_combo"]
    ].drop_duplicates().groupby("trapi_result_curie_combo", sort=False):
        overlap_names = []
        for curie in df1["overlap_curie_combo"].tolist()[0]:
            name = curie_to_name[curie]
            overlap_names.append(name)

        trapi_results_rows = []
        for i,row in df1.iterrows():
            trapi_result_chunks = []
            for trapi_result_column in trapi_result_columns:
                row_value = row[trapi_result_column]
                if row_value in overlap_names:
                    trapi_result_chunk = f'''
<span style="border: 1px orange solid; padding: 1px; border-radius: 4px;">{row_value}</span>
'''
                else:
                    trapi_result_chunk = f'{row_value}'
                trapi_result_chunks.append(trapi_result_chunk)
            trapi_results_rows.append(
                f'{" - ".join(trapi_result_chunks)}'
            )
        trapi_results_row_sets.append(f'''
<div style="font-size: x-small;">
{"<br>".join(trapi_results_rows)}
</div>
''')

    trapi_results_row_set_count = len(trapi_results_row_sets)
    display(HTML(data="<br>".join(trapi_results_row_sets[:display_count_limit])))
    if trapi_results_row_set_count > display_count_limit:
        print(f"...")
        print("")
        print(f"plus {trapi_results_row_set_count - display_count_limit} more")
    print("")

    count += 1
    if count >= limit:
        print("...")
        print(f'{len(set(results_with_figures_df["figure_id"])) - limit} more figures')
        break




## Next Steps

- How should we handle co-occurrence for genes vs. chemicals vs. diseases? We have many more genes than chemicals, and the number of diseases is rather small.