# Generate predictions
* Import predictions for probCBR and TransE
* Create a pipeline to get overlap information for each pair
* Get top overlaps

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import score_utils
import statistics
import re
import itertools
import pickle
from collections import defaultdict as dd

## Import probCBR results

In [2]:
def json_to_df(json_dir: str) -> pd.DataFrame:
    """
    Load a JSON file holding a list of records and return it as a dataframe.

    Inputs
    -----------
    - json_dir  * path to a JSON file whose top level is a list of dicts

    Returns
    -----------
    A dataframe with one row per record. The columns are the keys of the
    FIRST record (in that order); keys that only show up in later records are
    ignored, and a later record missing one of those keys raises `KeyError`.
    An empty JSON list gives an empty dataframe instead of an `IndexError`.
    """
    # import the json object
    with open(json_dir, "r", encoding="utf-8") as f:
        records = json.load(f)

    # no first record to infer the columns from
    if not records:
        return pd.DataFrame()

    # one column per key of the first record, filled record by record
    columns = list(records[0].keys())
    return pd.DataFrame({col: [rec[col] for rec in records] for col in columns})

In [3]:
# dir to probCBR prediction results
df = json_to_df("/home/msinha/Open-BIo-Link/data.json")

### Import graph to filter prediction results

In [4]:
def process_dataframe(
    df: pd.DataFrame,
    data_dir: str,
    model_dir: str,
    mode: str = "tail-batch",
    rel=[
        "indication",
        "indication_CiD",
    ],
):
    """
    Attach the known (true) targets from the graph to each prediction row.

    Inputs
    -----------
    - df        * prediction dataframe with columns
                  `e1`, `r`, `answers`, `predicted_answers`
    - data_dir  * passed to `score_utils.ProcessOutput` as `data_dir`
    - model_dir * directory containing `test_scores.tsv`
    - mode      * {'tail-batch', 'head-batch'}
    - rel       * relations to keep from the graph (the default list is only
                  read, never mutated)

    Returns
    -----------
    The left-merge of `df` with the true targets, restricted to rows that
    have at least one predicted answer. The `true_t` (tail-batch) or `true_h`
    (head-batch) column holds the de-duplicated union of the graph targets
    and the row's `answers`; rows without a graph match (NaN after the
    merge) fall back to the row's `answers` only.

    Raises
    -----------
    ValueError if `mode` is not one of the two supported values. The check
    runs before the graph is loaded.
    """
    valid_modes = ("head-batch", "tail-batch")
    if mode not in valid_modes:
        raise ValueError(f"{mode} is not in {valid_modes}")

    # get true head/tail triples from graph
    raw = score_utils.ProcessOutput(
        data_dir=data_dir,
        scores_outfile=os.path.join(model_dir, "test_scores.tsv"),
        mode=mode,
    )
    graph = raw.get_true_targets()
    graph = graph.query("r in @rel")

    if mode == "tail-batch":
        # the query entity `e1` is the head; graph column `t` holds the true tails
        query_col, merge_keys = "h", ["h", "r"]
        renames, true_col = {"t": "true_t"}, "true_t"
    else:
        # the query entity `e1` is the tail
        # NOTE(review): this rename (kept from the original) labels the query
        # entity column as `true_h` and the graph's `h` column as `t`, which
        # looks swapped -- confirm against the schema returned by
        # `get_true_targets()` in head-batch mode.
        query_col, merge_keys = "t", ["r", "t"]
        renames, true_col = {"t": "true_h", "h": "t"}, "true_h"

    merged = pd.merge(
        left=df[["e1", "r", "answers", "predicted_answers"]].rename(
            columns={"e1": query_col}
        ),
        right=graph,
        how="left",
        on=merge_keys,
    ).rename(columns=renames)

    # drop rows with no predictions; copy so the assignment below does not
    # write into a slice of the merge result
    has_preds = merged["predicted_answers"].apply(lambda x: len(x) > 0)
    merged = merged[has_preds].copy()

    # make sure `answers` is contained in the true targets; a float here is
    # the NaN left by the left-merge when the graph had no match
    merged[true_col] = [
        list(set(answers))
        if isinstance(known, float)
        else list(set(known).union(set(answers)))
        for known, answers in zip(merged[true_col], merged["answers"])
    ]

    return merged


# def process_dataframe(
#     df: pd.DataFrame, graph_dir: str, batch: str = "tail", rel=["indication", "indication_CiD",]
# ):
#     """
#     Gets True tail triples
#     """

#     if batch == "tail":
#         # get true tail triples from graph
#         graph = get_known_triples(
#             graph_dir=graph_dir,
#             rel = rel,
#         )
#         merged = pd.merge(
#             left=df[["e1", "r", "answers", "predicted_answers"]].rename(
#                 columns={"e1": "h"}
#             ),
#             right=graph,
#             how="left",
#             on=["h","r"],
#         )
#         merged = merged.rename(columns={"t": "true_t"})

#         # ensure answers is in 'true_t'. First remove all predicted answers with no predictions
#         merged = merged[merged["predicted_answers"].apply(lambda x: len(x) > 0)]
#         # add answers to true t
#         merged["true_t"] = [
#             list(set(val) | set(merged.answers.iloc[ind]))
#             if type(val) != float
#             else list(set(merged.answers.iloc[ind]))
#             for ind, val in enumerate(merged.true_t)
#         ]

#     elif batch == "head":
#         # get True head triples from graph
#         graph = get_known_triples(
#             graph_dir=graph_dir, batch="head", rel=rel
#         )
#         merged = pd.merge(
#             left=df[["e1", "r", "answers", "predicted_answers"]].rename(
#                 columns={"e1": "t"}
#             ),
#             right=graph,
#             how="left",
#             on=["r",'t'],
#         )
#         merged = merged.rename(columns={"t": "true_h", "h": "t"})

#         # ensure answers is in 'true_t'. First remove all predicted answers with no predictions
#         merged = merged[merged["predicted_answers"].apply(lambda x: len(x) > 0)]
#         # add answers to true t
#         merged["true_h"] = [
#             list(set(val) | set(merged.answers.iloc[ind]))
#             if type(val) != float
#             else list(set(merged.answers.iloc[ind]))
#             for ind, val in enumerate(merged.true_h)
#         ]

#     else:
#         raise ValueError(f'{batch} is not in {"head","tail"}')
#     return merged


def filter_predictions(df: pd.DataFrame, top: int = 50) -> pd.DataFrame:
    """
    Get filtered predictions that don't exist as triples in Train/Test/Valid

    Inputs
    -----------
    - df        * dataframe with columns `predicted_answers` and `answers`
    - top       * get the top 'n' results for each prediction.
                * `top = -1` (any negative value) if you want all results

    Returns
    -----------
    A new dataframe (the input is not modified) where `predicted_answers` is
    replaced by `filt_preds` and a `t` column, if present, is renamed to
    `true_t`.
    """
    # work on a copy so the caller's frame (often a query/slice result) is untouched
    df = df.copy()

    # remove answer from predicted_answers
    # presumably returns the first list without the items of the second --
    # confirm against score_utils.ProcessOutput.remove_list_from_list
    df["filt_preds"] = [
        score_utils.ProcessOutput.remove_list_from_list(preds, answers)
        for preds, answers in zip(df["predicted_answers"], df["answers"])
    ]
    df = df.rename(columns={"t": "true_t"}).drop(columns="predicted_answers")

    # get the top number of predictions; a plain `x[0:-1]` would silently drop
    # the last prediction, so negative `top` skips the truncation entirely
    if top >= 0:
        df["filt_preds"] = df["filt_preds"].apply(lambda x: x[0:top])

    return df

In [5]:
df.head(2)

Unnamed: 0,e1,r,answers,predicted_answers
0,CHEBI:135735,indication,[DOID:10763],"[DOID:6432, DOID:6000, DOID:10763, DOID:10591,..."
1,DOID:10763,indication_inv,"[CHEBI:135735, CHEBI:135738, CHEBI:141521, CHE...","[CHEBI:31548, CHEBI:46632, CHEBI:5784, CHEBI:5..."


In [6]:
df.shape

(594, 4)

In [7]:
df = df.query('r=="indication"')
df.shape

(386, 4)

In [8]:
merged = process_dataframe(
    df=df,
    data_dir="../data/MIND_CtD/",
    model_dir="../models/TransE_MIND_CtD_megha/",
)

In [9]:
merged.head(2)

Unnamed: 0,h,r,answers,predicted_answers,true_t
0,CHEBI:135735,indication,[DOID:10763],"[DOID:6432, DOID:6000, DOID:10763, DOID:10591,...","[DOID:10824, DOID:10825, DOID:10591, DOID:1076..."
1,CHEBI:135738,indication,[DOID:10763],"[DOID:10763, DOID:446, DOID:10591, DOID:10824,...","[DOID:10824, DOID:10825, DOID:10591, DOID:1076..."


In [10]:
merged.shape

(359, 5)

In [11]:
merged["true_t"] = merged["true_t"].apply(lambda x: list(set(x)))
merged["true_t"].apply(lambda x: len(x)).describe()

count    359.000000
mean       8.064067
std       12.905709
min        1.000000
25%        2.000000
50%        4.000000
75%        7.000000
max       87.000000
Name: true_t, dtype: float64

In [12]:
# heads that were in the probCBR input but did not survive into `merged`;
# build the lookup set once instead of once per element
merged_heads = set(merged.h)
missing = [i for i in set(df.e1) if i not in merged_heads]

len(missing)

27

## these are the indications that don't have a prediction from probCBR
```python
df.query('r=="indication" & e1 in @missing')
```

In [13]:
# combine answers and true_t together
merged["answers"] += merged["true_t"]
merged["answers"] = merged["answers"].apply(lambda x: list(set(x)))
prob_cbr = merged.query('r=="indication"')

In [14]:
prob_cbr["answers"].apply(lambda x: len(x)).describe()

count    359.000000
mean       8.064067
std       12.905709
min        1.000000
25%        2.000000
50%        4.000000
75%        7.000000
max       87.000000
Name: answers, dtype: float64

In [15]:
# remove answers from predicted answers
prob_cbr = filter_predictions(prob_cbr, top=-1)

In [16]:
prob_cbr.head(2)

Unnamed: 0,h,r,answers,true_t,filt_preds
0,CHEBI:135735,indication,"[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:6000, MESH:D018487, HP:0001681, HP:00009..."
1,CHEBI:135738,indication,"[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:446, DOID:1070, DOID:5845, DOID:5846, DO..."


## Get TransE results

In [17]:
# import and process into a dataframe
raw = score_utils.ProcessOutput(
    data_dir="../data/MIND_CtD",
    scores_outfile="../models/TransE_MIND_CtD_megha/test_scores.tsv",
    mode="tail-batch",
)
raw.format_raw_scores_to_df()
raw.translate_embeddings(direction="from")
transe_df = raw.filter_predictions(top=100).query("batch=='tail-batch'")
transe_df["algo"] = "TransE"

In [18]:
transe_df = transe_df.sort_values(by="h").drop_duplicates("h")

In [19]:
transe_df2 = transe_df.query("h in @prob_cbr.h").sort_values(by="h")

In [20]:
transe_df2.true_t.apply(lambda x: list(set(x))).apply(lambda x: len(x)).describe()

count    359.000000
mean       8.064067
std       12.905709
min        1.000000
25%        2.000000
50%        4.000000
75%        7.000000
max       87.000000
Name: true_t, dtype: float64

In [22]:
# make sure the predictions don't have "h" in it
# (pair each head with its own prediction list instead of rebuilding the
# full list of heads for every row)
transe_df2["filt_preds"] = [
    [pred for pred in preds if pred != head]
    for head, preds in zip(transe_df2["h"], transe_df2["filt_preds"])
]

In [23]:
transe_df2.head(2)

Unnamed: 0,h,r,preds,batch,true_t,filt_preds,algo
537,CHEBI:135735,indication,"[CHEBI:135735, DOID:9654, UMLS:C0221155, WD:Q2...",tail-batch,"[DOID:10591, DOID:10824, DOID:10825, DOID:1113...","[DOID:9654, UMLS:C0221155, WD:Q25303605, MONDO...",TransE
538,CHEBI:135738,indication,"[CHEBI:135738, DOID:10763, DOID:9654, WD:Q2530...",tail-batch,"[DOID:10591, DOID:10824, DOID:10825, DOID:1113...","[DOID:9654, WD:Q25303605, UMLS:C0221155, CHEBI...",TransE


In [24]:
transe_df2.shape

(359, 7)

In [25]:
len(set(transe_df.h))

387

In [26]:
# probCBR may not have found indications for the entire test set.
# quick check that all probCBR heads are nested in the TransE heads:
# the size of the intersection should equal the number of probCBR heads.
# (`a and b` on two sets just returns `b`; `&` is the actual intersection)
len(set(transe_df.h) & set(prob_cbr.query('r=="indication"').h))

359

In [27]:
# make sure both algorithms are scored against the same true tails:
# for every head keep only the true tails that BOTH frames agree on.
# Both frames are sorted by "h" so the positional assignment at the bottom lines up.
prob_cbr = prob_cbr.sort_values("h")
transe_df2 = transe_df2.sort_values("h")

consolidated_true_t = list()
for i, val in enumerate(list(prob_cbr.h)):
    # `.iloc[0]` takes the first matching row of each frame for this head
    # NOTE(review): assumes every probCBR head also exists in transe_df2
    # (otherwise `.iloc[0]` raises IndexError) -- confirm
    consolidated_true_t.append(
        list(
            set(prob_cbr.query("h==@val").true_t.iloc[0]).intersection(set(transe_df2.query("h==@val").true_t.iloc[0]))
        )
    )

# positional assignment: relies on both frames having the same heads in the
# same (sorted) order and the same number of rows
prob_cbr["true_t"] = consolidated_true_t
transe_df2["true_t"] = consolidated_true_t

### Function for checking overlap between two lists

In [28]:
def overlap(x: list, y: list, n_overlap: int = 50, mode: str = "count") -> list:
    """
    Returns the overlap of each pair of inner lists, matched by position
    Input:
    * x,y           list of lists to count overlaps between (pandas Series
                    of lists work too; pairing is positional, not by index)
    * n_overlap     only the first `n_overlap` items of each inner list are
                    considered
    * mode          {'count', 'percentage'}
                    - 'count':      number of shared items
                    - 'percentage': shared items / number of considered items
                                    from the shorter outer list; 0.0 when
                                    that inner list is empty
    Returns one value per pair; the result is as long as the shorter of x, y.
    Raises ValueError for an unknown mode.
    """
    if mode not in ("count", "percentage"):
        raise ValueError(f"mode {mode} not count or percentage")

    # iterate over the shorter outer list; its inner lists are the denominator
    if len(x) > len(y):
        x, y = y, x

    overlap_ls = []
    for preds_x, preds_y in zip(x, y):
        top_x = preds_x[0:n_overlap]
        top_y = preds_y[0:n_overlap]
        n_shared = len(set(top_x).intersection(set(top_y)))

        if mode == "count":
            overlap_ls.append(n_shared)
        else:
            # divide by what was actually compared, not the untruncated list
            considered = len(top_x)
            overlap_ls.append(n_shared / considered if considered else 0.0)

    return overlap_ls

In [29]:
transe_df2 = transe_df2.drop(columns="batch")
prob_cbr["algo"] = "probCBR"
prob_cbr = prob_cbr.rename(columns={"answers": "preds"})

In [30]:
prob_cbr.head(2)

Unnamed: 0,h,r,preds,true_t,filt_preds,algo
0,CHEBI:135735,indication,"[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:6000, MESH:D018487, HP:0001681, HP:00009...",probCBR
1,CHEBI:135738,indication,"[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:446, DOID:1070, DOID:5845, DOID:5846, DO...",probCBR


In [31]:
transe_df2.head(2)

Unnamed: 0,h,r,preds,true_t,filt_preds,algo
537,CHEBI:135735,indication,"[CHEBI:135735, DOID:9654, UMLS:C0221155, WD:Q2...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:9654, UMLS:C0221155, WD:Q25303605, MONDO...",TransE
538,CHEBI:135738,indication,"[CHEBI:135738, DOID:10763, DOID:9654, WD:Q2530...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:9654, WD:Q25303605, UMLS:C0221155, CHEBI...",TransE


#### Example for Generating combination pairs
n choose 3
```python
x = list()
x+= itertools.combinations(['a','b','c'],3)
```
```
[('a', 'b', 'c'),]
```

All ordered arrangements (permutations) of 3 items.
```python
x= list()
x+= itertools.permutations(['a','b','c',],3)
```
```
[('a', 'b', 'c'),
 ('a', 'c', 'b'),
 ('b', 'a', 'c'),
 ('b', 'c', 'a'),
 ('c', 'a', 'b'),
 ('c', 'b', 'a')]
```

In [32]:
def overlap_list(x: list, y: list) -> list:
    """
    Collect the distinct items found in both `x` and `y`.
    The order of the returned list is whatever the set iteration gives.
    """
    shared = set(x).intersection(set(y))
    return list(shared)


def overlap_list_location(x: list, y: list) -> list:
    """
    For every item shared by `x` and `y`, report where it first occurs in each
    list as an `(index_in_x, index_in_y)` tuple.
    """
    locations = []
    for shared in list(set(x) & set(y)):
        locations.append((x.index(shared), y.index(shared)))
    return locations


def overlap_diff(
    x: list,
    y: list,
) -> list:
    """
    Compare two lists of lists position by position.
    For each pair of inner lists, return the absolute rank difference of
    every item the two inner lists have in common.
    """
    # walk over the shorter outer list
    if len(x) > len(y):
        x, y = y, x

    return [
        [abs(rank_x - rank_y) for rank_x, rank_y in overlap_list_location(inner, y[pos])]
        for pos, inner in enumerate(x)
    ]

## Get an overlapping list of hits.
* before and after icd9/rxnorm filtration

### Get the highest overlapping objects between all algorithms
* Unfortunately, we can't get an overlap for all algorithms.

In [33]:
# function for determining the unique set for all algos per indication.
def overlap_counter(x: list) -> dict:
    """
    takes a list of lists and returns a dict mapping every item to its number
    of *repeat* occurrences across all the lists supplied

    The first time an item is seen it is stored with 0, and every further
    occurrence adds 1, so the value is `occurrences - 1`: 0 means the item
    appears once (no overlap), 1 means it appears twice, and so on.
    Repeats inside the same inner list are counted as well.
    Keys keep first-seen order.
    """
    a_dict = dict()
    for inner in x:
        for k in inner:
            # a missing key starts at -1 so that the first sighting lands on 0
            a_dict[k] = a_dict.get(k, -1) + 1
    return a_dict


# function for getting the n_overlapping objects in a list
def overlap_size(x: list, y: int = 1) -> list:
    """
    List the items whose value in `overlap_counter(x)` is at least `y`.
    """
    counts = overlap_counter(x)

    # keep only the keys that reach the requested threshold
    return [item for item, n_overlaps in counts.items() if n_overlaps >= y]


# function for overlap
def overlap_percentage(x: list, y: int) -> float:
    """
    takes a list of lists and returns the number of overlapping items divided
    by the length of the smallest list
    - y considers the minimum number of overlaps (see `overlap_size`)

    Returns 0.0 when `x` is empty or when its smallest inner list is empty,
    instead of failing on `min([])` or dividing by zero.
    """
    if not x:
        return 0.0

    smallest_list = min(len(i) for i in x)
    if smallest_list == 0:
        return 0.0

    num_overlaps = len(overlap_size(x, y))
    return num_overlaps / smallest_list


def overlap_ent_extract(df1: pd.DataFrame, df2: pd.DataFrame, batch: str = "h") -> list:
    """
    Return the entities that occur in the same column of both dataframes.
    `batch` picks the column: "h" compares `df1.h` with `df2.h`,
    "t" compares `df1.t` with `df2.t`; anything else raises ValueError.
    """
    if batch not in ("h", "t"):
        raise ValueError(f"{batch} is not h | t")

    # attribute access on purpose: same lookup as `df1.h` / `df1.t`
    ents_1 = list(getattr(df1, batch))
    ents_2 = list(getattr(df2, batch))
    return overlap_list(ents_1, ents_2)


def overlap_list(x: list, y: list) -> list:
    """
    Give back, as a list, the unique elements that `x` and `y` share.
    No particular ordering of the result is guaranteed.
    """
    in_both = set(x) & set(y)
    return list(in_both)


def overlap_list_location(x: list, y: list) -> list:
    """
    Takes two lists and returns the location of overlapping objects as a list(tuple)

    Each tuple is `(first index in x, first index in y)` for one shared item.
    The tuples are ordered by the item's first position in `x`, so the result
    is reproducible between runs. Positions are looked up through one pass
    over each list rather than a `.index()` scan per shared item.
    """

    def first_positions(seq) -> dict:
        # item -> index of its first occurrence
        positions = dict()
        for idx, item in enumerate(seq):
            positions.setdefault(item, idx)
        return positions

    x_pos = first_positions(x)
    y_pos = first_positions(y)

    return [(ix, y_pos[item]) for item, ix in x_pos.items() if item in y_pos]


def overlap_diff(
    x: list,
    y: list,
) -> list:
    """
    takes two lists of lists to compare as inputs
    returns, for each pair of inner lists, the difference in ranks of the
    objects that overlap between the two inner lists.

    Inner lists are paired by position (same as `overlap`), so pandas Series
    with a non-default index are paired row by row rather than by label.
    The result is as long as the shorter of `x` and `y`.
    """
    # iterate over the shorter outer list; zip() stops at its end
    if len(x) > len(y):
        x, y = y, x

    overlap_diff_ls = list()
    for inner_x, inner_y in zip(x, y):
        loc_ls = overlap_list_location(inner_x, inner_y)
        overlap_diff_ls.append(
            [abs(j[0] - j[1]) for j in loc_ls]
        )  # difference between tuples

    return overlap_diff_ls

## What about overlaps between the two best-performing algorithms on our dataset?
* Pick the 2 best because we don't want to decrease our performance by using overlaps from worse-performing algorithms

In [34]:
# get overlap score for overlapping items for each triple
# mode="percentage": shared predictions among the first 100 of each list,
# as a fraction rather than a raw count
overlap_res = overlap(
    transe_df2.sort_values(by="h").filt_preds,
    prob_cbr.sort_values(by="h").filt_preds,
    n_overlap=100,
    mode="percentage",
)
# `overlap_res` is a plain list, so it is assigned by position.
# NOTE(review): this only lines up if both frames are already sorted by "h"
# and contain the same heads -- confirm before reusing this cell elsewhere
transe_df2["overlap"] = overlap_res
prob_cbr["overlap"] = overlap_res

In [35]:
transe_pcbr = pd.concat([transe_df2, prob_cbr])
transe_pcbr.shape[0] == 2 * prob_cbr.shape[0]

True

In [36]:
# max amount of overlap between the two algos used to be 0.33, now it's 0.45.
(pd.Series(overlap_res)).describe()

count    359.000000
mean       0.140345
std        0.077910
min        0.000000
25%        0.085859
50%        0.141414
75%        0.181818
max        0.454545
dtype: float64

In [37]:
transe_pcbr_filt = transe_pcbr.sort_values(by=["h", "algo"])

In [38]:
transe_pcbr_filt

Unnamed: 0,h,r,preds,true_t,filt_preds,algo,overlap
537,CHEBI:135735,indication,"[CHEBI:135735, DOID:9654, UMLS:C0221155, WD:Q2...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:9654, UMLS:C0221155, WD:Q25303605, MONDO...",TransE,0.242424
0,CHEBI:135735,indication,"[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:6000, MESH:D018487, HP:0001681, HP:00009...",probCBR,0.242424
538,CHEBI:135738,indication,"[CHEBI:135738, DOID:10763, DOID:9654, WD:Q2530...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:9654, WD:Q25303605, UMLS:C0221155, CHEBI...",TransE,0.222222
1,CHEBI:135738,indication,"[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:446, DOID:1070, DOID:5845, DOID:5846, DO...",probCBR,0.222222
539,CHEBI:135876,indication,"[CHEBI:135876, DOID:11813, DOID:5432, DOID:118...","[DOID:11818, DOID:11821, DOID:5427, DOID:11593...","[DOID:11813, DOID:5432, DOID:11820, DOID:11809...",TransE,0.171717
...,...,...,...,...,...,...,...
378,IKEY:XYGBKMMCQDZQOZ-UHFFFAOYSA-M,indication,[DOID:8986],[DOID:8986],"[DOID:7148, DOID:2355, DOID:1826, DOID:6364, D...",probCBR,0.141414
1061,IKEY:YHIUPZFKHZTLSH-LXYIGGQGSA-N,indication,"[IKEY:YHIUPZFKHZTLSH-LXYIGGQGSA-N, CHEBI:13472...","[DOID:1837, DOID:9352, DOID:10182, DOID:11712]","[CHEBI:134725, DOID:1607, DOID:0110741, DOID:1...",TransE,0.000000
379,IKEY:YHIUPZFKHZTLSH-LXYIGGQGSA-N,indication,"[DOID:1837, DOID:9352, DOID:10182, DOID:11712]","[DOID:1837, DOID:9352, DOID:10182, DOID:11712]","[KEGG:hsa04930, MESH:D007249, HP:0100806, HP:0...",probCBR,0.000000
1064,UNII:6897GXD6OE,indication,"[UNII:6897GXD6OE, DOID:11758, DOID:583, DOID:1...","[DOID:13121, DOID:11758]","[DOID:583, DOID:1341, DOID:12450, IKEY:MQBDAEH...",TransE,0.121212


In [39]:
# get overlap filtered predictions
# The frame is walked row by row. On every even position the two rows that
# share this head are fetched and the intersection of their `filt_preds`
# is computed; the following odd position reuses that same result.
# NOTE(review): assumes exactly two consecutive rows per head (one per
# algorithm) in `transe_pcbr_filt` -- confirm if more algorithms are added

overlap_ls = []
for i, v in enumerate(transe_pcbr_filt.h):
    if i % 2 == 0:
        # both rows for head `v`, in the frame's current order
        list_of_list_filt_preds = list(transe_pcbr_filt.query("h == @v").filt_preds)
        overlaps = overlap_list(
            x=list_of_list_filt_preds[0], y=list_of_list_filt_preds[1]
        )
        overlap_ls.append(overlaps)
    else:
        # second row of the pair: same overlap list as the row before it
        overlap_ls.append(overlap_ls[i - 1])

In [40]:
len(overlap_ls) == transe_pcbr_filt.shape[0]

True

In [41]:
# add overlaps to the dataframe
transe_pcbr_filt["overlaps"] = overlap_ls
transe_pcbr_filt.head()

Unnamed: 0,h,r,preds,true_t,filt_preds,algo,overlap,overlaps
537,CHEBI:135735,indication,"[CHEBI:135735, DOID:9654, UMLS:C0221155, WD:Q2...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:9654, UMLS:C0221155, WD:Q25303605, MONDO...",TransE,0.242424,"[WD:Q25303605, DOID:557, DOID:11503, DOID:1184..."
0,CHEBI:135735,indication,"[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:6000, MESH:D018487, HP:0001681, HP:00009...",probCBR,0.242424,"[WD:Q25303605, DOID:557, DOID:11503, DOID:1184..."
538,CHEBI:135738,indication,"[CHEBI:135738, DOID:10763, DOID:9654, WD:Q2530...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:9654, WD:Q25303605, UMLS:C0221155, CHEBI...",TransE,0.222222,"[DOID:0060173, MESH:D018487, DOID:9654, DOID:1..."
1,CHEBI:135738,indication,"[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:446, DOID:1070, DOID:5845, DOID:5846, DO...",probCBR,0.222222,"[DOID:0060173, MESH:D018487, DOID:9654, DOID:1..."
539,CHEBI:135876,indication,"[CHEBI:135876, DOID:11813, DOID:5432, DOID:118...","[DOID:11818, DOID:11821, DOID:5427, DOID:11593...","[DOID:11813, DOID:5432, DOID:11820, DOID:11809...",TransE,0.171717,"[DOID:0050908, DOID:2394, DOID:8864, DOID:4000..."


In [42]:
transe_pcbr_filt

Unnamed: 0,h,r,preds,true_t,filt_preds,algo,overlap,overlaps
537,CHEBI:135735,indication,"[CHEBI:135735, DOID:9654, UMLS:C0221155, WD:Q2...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:9654, UMLS:C0221155, WD:Q25303605, MONDO...",TransE,0.242424,"[WD:Q25303605, DOID:557, DOID:11503, DOID:1184..."
0,CHEBI:135735,indication,"[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:6000, MESH:D018487, HP:0001681, HP:00009...",probCBR,0.242424,"[WD:Q25303605, DOID:557, DOID:11503, DOID:1184..."
538,CHEBI:135738,indication,"[CHEBI:135738, DOID:10763, DOID:9654, WD:Q2530...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:9654, WD:Q25303605, UMLS:C0221155, CHEBI...",TransE,0.222222,"[DOID:0060173, MESH:D018487, DOID:9654, DOID:1..."
1,CHEBI:135738,indication,"[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:446, DOID:1070, DOID:5845, DOID:5846, DO...",probCBR,0.222222,"[DOID:0060173, MESH:D018487, DOID:9654, DOID:1..."
539,CHEBI:135876,indication,"[CHEBI:135876, DOID:11813, DOID:5432, DOID:118...","[DOID:11818, DOID:11821, DOID:5427, DOID:11593...","[DOID:11813, DOID:5432, DOID:11820, DOID:11809...",TransE,0.171717,"[DOID:0050908, DOID:2394, DOID:8864, DOID:4000..."
...,...,...,...,...,...,...,...,...
378,IKEY:XYGBKMMCQDZQOZ-UHFFFAOYSA-M,indication,[DOID:8986],[DOID:8986],"[DOID:7148, DOID:2355, DOID:1826, DOID:6364, D...",probCBR,0.141414,"[DOID:594, MESH:D002471, DOID:14320, DOID:1470..."
1061,IKEY:YHIUPZFKHZTLSH-LXYIGGQGSA-N,indication,"[IKEY:YHIUPZFKHZTLSH-LXYIGGQGSA-N, CHEBI:13472...","[DOID:1837, DOID:9352, DOID:10182, DOID:11712]","[CHEBI:134725, DOID:1607, DOID:0110741, DOID:1...",TransE,0.000000,[]
379,IKEY:YHIUPZFKHZTLSH-LXYIGGQGSA-N,indication,"[DOID:1837, DOID:9352, DOID:10182, DOID:11712]","[DOID:1837, DOID:9352, DOID:10182, DOID:11712]","[KEGG:hsa04930, MESH:D007249, HP:0100806, HP:0...",probCBR,0.000000,[]
1064,UNII:6897GXD6OE,indication,"[UNII:6897GXD6OE, DOID:11758, DOID:583, DOID:1...","[DOID:13121, DOID:11758]","[DOID:583, DOID:1341, DOID:12450, IKEY:MQBDAEH...",TransE,0.121212,"[DOID:583, DOID:720, DOID:2355, DOID:12449, DO..."


In [43]:
# get the overlap_ranks for each algorithm predictions
def find_overlap_rank(df: pd.DataFrame) -> list:
    """
    For every row of df, find the rank within filt_preds of each item that is also listed in overlaps.

    df needs the columns filt_preds (an ordered sequence of predictions per row)
    and overlaps (the items to look up for that row; they must be hashable).

    Returns a list with one entry per row of df. Each entry is a list of
    (item, rank) tuples sorted by ascending rank, where rank is the 1-based
    position of the item in filt_preds. Items in overlaps that never occur in
    filt_preds are left out. If an item occurs more than once in filt_preds,
    the rank of its last occurrence is reported.
    """
    rank_dict_ls = []
    # walk both columns in lockstep instead of re-materialising df.overlaps per row
    for preds, overlapping_items in zip(df.filt_preds, df.overlaps):
        # set gives constant-time membership tests
        overlap_lookup = set(overlapping_items)

        # +1 because python indexes from 0; a repeated item keeps its latest rank
        rank_dict = {
            item: position + 1
            for position, item in enumerate(preds)
            if item in overlap_lookup
        }

        rank_dict_ls.append(sorted(rank_dict.items(), key=lambda pair: pair[1]))

    return rank_dict_ls

In [44]:
transe_pcbr_filt["overlap_rank"] = find_overlap_rank(transe_pcbr_filt)
transe_pcbr_filt

Unnamed: 0,h,r,preds,true_t,filt_preds,algo,overlap,overlaps,overlap_rank
537,CHEBI:135735,indication,"[CHEBI:135735, DOID:9654, UMLS:C0221155, WD:Q2...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:9654, UMLS:C0221155, WD:Q25303605, MONDO...",TransE,0.242424,"[WD:Q25303605, DOID:557, DOID:11503, DOID:1184...","[(DOID:9654, 1), (UMLS:C0221155, 2), (WD:Q2530..."
0,CHEBI:135735,indication,"[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:6000, MESH:D018487, HP:0001681, HP:00009...",probCBR,0.242424,"[WD:Q25303605, DOID:557, DOID:11503, DOID:1184...","[(DOID:6000, 1), (DOID:1067, 8), (DOID:14557, ..."
538,CHEBI:135738,indication,"[CHEBI:135738, DOID:10763, DOID:9654, WD:Q2530...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:9654, WD:Q25303605, UMLS:C0221155, CHEBI...",TransE,0.222222,"[DOID:0060173, MESH:D018487, DOID:9654, DOID:1...","[(DOID:9654, 1), (WD:Q25303605, 2), (UMLS:C022..."
1,CHEBI:135738,indication,"[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:446, DOID:1070, DOID:5845, DOID:5846, DO...",probCBR,0.222222,"[DOID:0060173, MESH:D018487, DOID:9654, DOID:1...","[(DOID:446, 1), (DOID:1067, 13), (DOID:1875, 1..."
539,CHEBI:135876,indication,"[CHEBI:135876, DOID:11813, DOID:5432, DOID:118...","[DOID:11818, DOID:11821, DOID:5427, DOID:11593...","[DOID:11813, DOID:5432, DOID:11820, DOID:11809...",TransE,0.171717,"[DOID:0050908, DOID:2394, DOID:8864, DOID:4000...","[(DOID:6901, 8), (DOID:8864, 16), (DOID:5117, ..."
...,...,...,...,...,...,...,...,...,...
378,IKEY:XYGBKMMCQDZQOZ-UHFFFAOYSA-M,indication,[DOID:8986],[DOID:8986],"[DOID:7148, DOID:2355, DOID:1826, DOID:6364, D...",probCBR,0.141414,"[DOID:594, MESH:D002471, DOID:14320, DOID:1470...","[(DOID:1826, 3), (DOID:631, 14), (DOID:3328, 1..."
1061,IKEY:YHIUPZFKHZTLSH-LXYIGGQGSA-N,indication,"[IKEY:YHIUPZFKHZTLSH-LXYIGGQGSA-N, CHEBI:13472...","[DOID:1837, DOID:9352, DOID:10182, DOID:11712]","[CHEBI:134725, DOID:1607, DOID:0110741, DOID:1...",TransE,0.000000,[],[]
379,IKEY:YHIUPZFKHZTLSH-LXYIGGQGSA-N,indication,"[DOID:1837, DOID:9352, DOID:10182, DOID:11712]","[DOID:1837, DOID:9352, DOID:10182, DOID:11712]","[KEGG:hsa04930, MESH:D007249, HP:0100806, HP:0...",probCBR,0.000000,[],[]
1064,UNII:6897GXD6OE,indication,"[UNII:6897GXD6OE, DOID:11758, DOID:583, DOID:1...","[DOID:13121, DOID:11758]","[DOID:583, DOID:1341, DOID:12450, IKEY:MQBDAEH...",TransE,0.121212,"[DOID:583, DOID:720, DOID:2355, DOID:12449, DO...","[(DOID:583, 1), (DOID:12450, 3), (DOID:2355, 7..."


In [45]:
# calculate mean rank between the TransE and probCBR predictions
# The loop relies on the frame holding two consecutive rows per head:
# even positions compute the means, odd positions reuse the entry just built.
overlap_mean_ls = []
for i, v in enumerate(transe_pcbr_filt.h):
    if i % 2 != 0:
        # second row of the pair: share the list computed for the row above
        overlap_mean_ls.append(overlap_mean_ls[i - 1])
        continue

    # overlap_rank lists of every row whose head equals v
    ranks_for_head = list(transe_pcbr_filt.query("h == @v").overlap_rank)

    # turn the (item, rank) tuples of the first two matching rows into lookups
    first_ranks = {entry[0]: entry[1] for entry in ranks_for_head[0]}
    second_ranks = {entry[0]: entry[1] for entry in ranks_for_head[1]}

    # average the two ranks of every item found in the first row's list
    averaged = {
        item: statistics.mean([first_ranks[item], second_ranks[item]])
        for item in first_ranks
    }

    # store as (item, mean rank) tuples, best (lowest) mean rank first
    overlap_mean_ls.append(sorted(averaged.items(), key=lambda pair: pair[1]))

In [46]:
transe_pcbr_filt["mean_overlap_rank"] = overlap_mean_ls

In [47]:
# calculate summary stats for the overlap. Then filter
transe_pcbr_filt.overlap.describe()

count    718.000000
mean       0.140345
std        0.077856
min        0.000000
25%        0.083333
50%        0.141414
75%        0.181818
max        0.454545
Name: overlap, dtype: float64

In [48]:
transe_pcbr_filt2 = transe_pcbr_filt.sort_values(
    by=["overlap", "h"], ascending=False
).query("overlap>0")
transe_pcbr_filt2

Unnamed: 0,h,r,preds,true_t,filt_preds,algo,overlap,overlaps,overlap_rank,mean_overlap_rank
1048,CHEBI:9620,indication,"[CHEBI:9620, DOID:12403, DOID:0080161, DOID:13...","[DOID:11917, DOID:13369, DOID:12179, DOID:9060...","[DOID:0080161, DOID:12404, DOID:2272, DOID:433...",TransE,0.454545,"[DOID:0050271, DOID:11341, DOID:1564, DOID:130...","[(DOID:0080161, 1), (DOID:12404, 2), (DOID:227...","[(DOID:0080161, 2), (DOID:2272, 2), (DOID:5027..."
368,CHEBI:9620,indication,"[DOID:11917, DOID:13369, DOID:12179, DOID:9060...","[DOID:11917, DOID:13369, DOID:12179, DOID:9060...","[DOID:2272, DOID:14262, DOID:0080161, DOID:005...",probCBR,0.454545,"[DOID:0050271, DOID:11341, DOID:1564, DOID:130...","[(DOID:2272, 1), (DOID:14262, 2), (DOID:008016...","[(DOID:0080161, 2), (DOID:2272, 2), (DOID:5027..."
755,CHEBI:480999,indication,"[CHEBI:480999, DOID:2153, DOID:5117, DOID:2145...","[DOID:3910, DOID:4556, DOID:3907, DOID:3908]","[DOID:2153, DOID:5117, DOID:2145, DOID:2143, D...",TransE,0.383838,"[DOID:3016, DOID:2394, DOID:8691, WD:Q6883839,...","[(DOID:2153, 1), (DOID:5117, 2), (DOID:2145, 3...","[(DOID:5410, 9), (DOID:2394, 9), (DOID:5623, 1..."
149,CHEBI:480999,indication,"[DOID:3910, DOID:4556, DOID:3907, DOID:3908]","[DOID:3910, DOID:4556, DOID:3907, DOID:3908]","[DOID:2394, DOID:5520, DOID:3905, DOID:5410, D...",probCBR,0.383838,"[DOID:3016, DOID:2394, DOID:8691, WD:Q6883839,...","[(DOID:2394, 1), (DOID:5520, 2), (DOID:5410, 4...","[(DOID:5410, 9), (DOID:2394, 9), (DOID:5623, 1..."
628,CHEBI:31686,indication,"[CHEBI:31686, DOID:1614, MESH:D016609, DOID:30...","[DOID:8552, DOID:1039, DOID:1037, DOID:8864, D...","[DOID:1614, MESH:D016609, DOID:3016, DOID:1240...",TransE,0.373737,"[DOID:2394, DOID:2153, DOID:3008, DOID:7983, D...","[(DOID:1614, 1), (DOID:1240, 4), (DOID:3011, 5...","[(DOID:1240, 2.5), (DOID:1614, 6.5), (DOID:104..."
...,...,...,...,...,...,...,...,...,...,...
44,CHEBI:2930,indication,"[KEGG:hsa05323, DOID:7148, WD:Q3281303]","[KEGG:hsa05323, WD:Q3281303, DOID:7148]","[DOID:1339, DOID:8577, DOID:8761, DOID:13832, ...",probCBR,0.010101,[DOID:9008],"[(DOID:9008, 11)]","[(DOID:9008, 39)]"
591,CHEBI:28748,indication,"[CHEBI:28748, DOID:10540, DOID:11054, DOID:445...","[DOID:1319, WD:Q6883839, DOID:10290, DOID:0060...","[DOID:10540, DOID:11054, DOID:4450, DOID:4235,...",TransE,0.010101,[DOID:8577],"[(DOID:8577, 67)]","[(DOID:8577, 77)]"
42,CHEBI:28748,indication,"[DOID:1319, WD:Q6883839, DOID:10290, DOID:0060...","[DOID:1319, WD:Q6883839, DOID:10290, DOID:0060...","[MONDO:0014407, DOID:0090072, DOID:0060010, DO...",probCBR,0.010101,[DOID:8577],"[(DOID:8577, 87)]","[(DOID:8577, 77)]"
547,CHEBI:136043,indication,"[CHEBI:136043, CHEBI:136042, IKEY:FKCMADOPPWWG...","[DOID:1837, DOID:9352, DOID:10182, DOID:11712]","[CHEBI:136042, IKEY:FKCMADOPPWWGNZ-MQWKRIRWSA-...",TransE,0.010101,[MESH:D008106],"[(MESH:D008106, 63)]","[(MESH:D008106, 46.5)]"


In [49]:
# How many remain after 25% overlap filter?
transe_pcbr_filt.sort_values(
    by=["overlap", "h"], ascending=False
).query("overlap>.25").shape

(66, 10)

In [50]:
transe_pcbr_filt.sort_values(
    by=["overlap", "h"], ascending=False
).query("overlap>.25")[['h','r','algo','overlap','mean_overlap_rank']].explode('mean_overlap_rank')

Unnamed: 0,h,r,algo,overlap,mean_overlap_rank
1048,CHEBI:9620,indication,TransE,0.454545,"(DOID:0080161, 2)"
1048,CHEBI:9620,indication,TransE,0.454545,"(DOID:2272, 2)"
1048,CHEBI:9620,indication,TransE,0.454545,"(DOID:50275, 5)"
1048,CHEBI:9620,indication,TransE,0.454545,"(DOID:4337, 5.5)"
1048,CHEBI:9620,indication,TransE,0.454545,"(DOID:13074, 6)"
...,...,...,...,...,...
86,CHEBI:3387,indication,probCBR,0.252525,"(DOID:1470, 60.5)"
86,CHEBI:3387,indication,probCBR,0.252525,"(DOID:9008, 63)"
86,CHEBI:3387,indication,probCBR,0.252525,"(DOID:14228, 65.5)"
86,CHEBI:3387,indication,probCBR,0.252525,"(MONDO:0009945, 72)"


In [51]:
# Build the id -> name table: read the "id" and "name" columns of two node
# files, stack them, and drop rows that are exact (id, name) duplicates.
# NOTE(review): the first path is absolute and user-specific, the second is
# relative to the working directory -- confirm both resolve where this runs.
ent_mapping = pd.concat(
    [
        pd.read_csv("/home/msinha/MRN_dataset/node_biolink.csv")[["id", "name"]],
        pd.read_csv("../../../MRN_dataset/nodes_biolink.csv")[["id", "name"]],
    ]
).drop_duplicates()

  pd.read_csv("/home/msinha/MRN_dataset/node_biolink.csv")[["id", "name"]],
  pd.read_csv("../../../MRN_dataset/nodes_biolink.csv")[["id", "name"]],


In [52]:
ent_mapping.head(2)

Unnamed: 0,id,name
0,UBERON:0000002,cervix
1,UBERON:0000004,human nose


In [53]:
# id -> name lookup; if an id appears with several names the last row wins
ent_mapping_dict = dict(zip(ent_mapping.id, ent_mapping.name))

In [54]:
# Transform codes into Human readable drugs and diseases
# Every lookup indexes ent_mapping_dict directly, so an id without a name
# raises KeyError rather than being passed through silently.
trans_pcbr_filt_human = transe_pcbr_filt.copy()

# head compound: a single id per row
trans_pcbr_filt_human["h"] = transe_pcbr_filt["h"].apply(
    lambda code: ent_mapping_dict[code]
)

# columns holding a plain list of ids per row
for col in ("filt_preds", "overlaps"):
    trans_pcbr_filt_human[col] = transe_pcbr_filt[col].apply(
        lambda codes: [ent_mapping_dict[code] for code in codes]
    )

# columns holding a list of (id, rank) tuples per row; the rank is kept as is
for col in ("overlap_rank", "mean_overlap_rank"):
    trans_pcbr_filt_human[col] = transe_pcbr_filt[col].apply(
        lambda pairs: [(ent_mapping_dict[pair[0]], pair[1]) for pair in pairs]
    )

trans_pcbr_filt_human.head(2)

Unnamed: 0,h,r,preds,true_t,filt_preds,algo,overlap,overlaps,overlap_rank,mean_overlap_rank
537,delapril,indication,"[CHEBI:135735, DOID:9654, UMLS:C0221155, WD:Q2...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[obsolete hypertension complicating pregnancy,...",TransE,0.242424,"[Hypertensive crisis, kidney disease, diabetic...",[(obsolete hypertension complicating pregnancy...,"[(congestive heart failure, 3.5), (primary pul..."
0,delapril,indication,"[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[congestive heart failure, Asymptomatic left v...",probCBR,0.242424,"[Hypertensive crisis, kidney disease, diabetic...","[(congestive heart failure, 1), (open-angle gl...","[(congestive heart failure, 3.5), (primary pul..."


In [55]:
trans_pcbr_filt_human.shape

(718, 10)

In [56]:
search = ["butenafine hydrochloride","econazole","Enasidenib","imatinib methanesulfonate","methoxsalen","Phendimetrazine","Rizatriptan benzoate","tolbutamide","vincaleukoblastine"]

In [57]:
trans_pcbr_filt_human.query('h in @search and algo =="TransE"')

Unnamed: 0,h,r,preds,true_t,filt_preds,algo,overlap,overlaps,overlap_rank,mean_overlap_rank
564,methoxsalen,indication,"[CHEBI:18358, DOID:12336, DOID:9952, DOID:684,...","[DOID:8691, KEGG:hsa05215, DOID:0060061, DOID:...","[male infertility, acute lymphoblastic leukemi...",TransE,0.20202,"[ovarian cancer, breast cancer, Hepatic injury...","[(pancreatic cancer, 5), (breast cancer, 8), (...","[(breast cancer, 11.5), (breast carcinoma, 12...."
579,vincaleukoblastine,indication,"[CHEBI:27375, DOID:4441, MONDO:0015533, WD:Q80...","[DOID:5842, DOID:4436, DOID:6110, DOID:8691, D...","[dysgerminoma, benign cephalic histiocytosis, ...",TransE,0.353535,"[prolymphocytic leukemia, rhabdomyosarcoma, ov...","[(dysgerminoma, 1), (adenosquamous breast carc...","[(megakaryocytic leukemia, 23), (ovarian cance..."
584,tolbutamide,indication,"[CHEBI:27999, DOID:9352, DOID:10182, KEGG:hsa0...","[DOID:1837, DOID:9352, KEGG:hsa04930, DOID:117...","[Liver Neoplasms, Experimental, Glioma, long Q...",TransE,0.242424,"[nasopharynx carcinoma, hyperparathyroidism, l...","[(hypertrophic cardiomyopathy, 5), (kidney can...","[(Diabetes Mellitus, Experimental, 9), (hypert..."
611,butenafine hydrochloride,indication,"[CHEBI:31325, DOID:0050116, DOID:13368, DOID:5...","[DOID:9060, DOID:12403]","[tinea imbricata, tinea profunda, Onychomycosi...",TransE,0.222222,"[drug allergy, obsolete fungal lung infectious...","[(Onychomycosis, 3), (obsolete tinea, 4), (tin...","[(Onychomycosis, 3), (tinea corporis, 5.5), (c..."
629,imatinib methanesulfonate,indication,"[CHEBI:31690, DOID:80147, DOID:3963, DOID:6006...","[DOID:7757, DOID:5560, DOID:8864, DOID:3264, M...","[T-cell lymphoblastic lymphoma, thyroid gland ...",TransE,0.282828,"[skin melanoma, Myelodysplastic syndromes, dys...","[(thyroid gland carcinoma, 2), (Cutaneous T-ce...","[(thyroid gland carcinoma, 13), (multiple myel..."
966,Phendimetrazine,indication,"[CHEBI:8059, DOID:0060814, MESH:D063766, DOID:...","[DOID:11981, DOID:9970]","[Wilson-Turner syndrome, childhood obesity, su...",TransE,0.191919,"[schizophrenia, major depressive disorder, fro...","[(Wilson-Turner syndrome, 1), (childhood obesi...","[(Wilson-Turner syndrome, 3), (childhood obesi..."
980,econazole,indication,"[CHEBI:82873, DOID:12711, MESH:C536777, MESH:D...","[DOID:0080161, DOID:9060, DOID:12403]","[black piedra, Systemic candidiasis, fungal ey...",TransE,0.252525,"[scabies, tinea unguium, American histoplasmos...","[(Systemic candidiasis, 2), (chromoblastomycos...","[(Systemic candidiasis, 4.5), (Onychomycosis, ..."
1013,Rizatriptan benzoate,indication,"[CHEBI:8875, DOID:12783, UMLS:C2349465, DOID:1...","[DOID:6364, DOID:10024, WD:Q7604533]","[migraine without aura, Persistent aura withou...",TransE,0.181818,"[anxiety disorder, status epilepticus, extrate...","[(migraine without aura, 1), (Persistent aura ...","[(major depressive disorder, 13), (status epil..."
1059,Enasidenib,indication,"[DOID:0070323, WD:Q4677939, WD:Q6865354, MESH:...","[DOID:8864, DOID:9119, WD:Q1110197, DOID:00603...","[childhood acute myeloid leukemia, Acute myelo...",TransE,0.313131,"[Myelodysplastic syndromes, childhood leukemia...","[(Acute erythroid leukemia, 11), (mast-cell le...","[(leukemia, 16), (myeloid leukemia, 30.5), (B-..."


In [58]:
# lets cut that down to size, get only the top 20 mean overlap rank
def filter_mean_overlap_rank(df: pd.DataFrame, val: int = 20) -> pd.DataFrame:
    """
    Return a copy of df with an extra column "filt_mean_overlap_rank".

    df needs the column mean_overlap_rank, holding a list of (item, mean rank)
    tuples per row. For each row the new column keeps only the tuples whose
    mean rank is less than or equal to val, in their original order; rows with
    no such tuple get an empty list.

    The input frame is not modified, so calling this repeatedly with different
    thresholds yields independent results. Each row is filtered from its own
    mean_overlap_rank value, so the result does not depend on row order or on
    head names being unique.
    """
    filtered = [
        [pair for pair in ranks if pair[1] <= val] for ranks in df["mean_overlap_rank"]
    ]

    # copy first: writing the column onto df itself would make every result of
    # this function share (and overwrite) the same "filt_mean_overlap_rank"
    result = df.copy()
    result["filt_mean_overlap_rank"] = filtered

    return result

In [59]:
trans_pcbr_filt_human20 = filter_mean_overlap_rank(trans_pcbr_filt_human, 20)
trans_pcbr_filt_human20

Unnamed: 0,h,r,preds,true_t,filt_preds,algo,overlap,overlaps,overlap_rank,mean_overlap_rank,filt_mean_overlap_rank
537,delapril,indication,"[CHEBI:135735, DOID:9654, UMLS:C0221155, WD:Q2...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[obsolete hypertension complicating pregnancy,...",TransE,0.242424,"[Hypertensive crisis, kidney disease, diabetic...",[(obsolete hypertension complicating pregnancy...,"[(congestive heart failure, 3.5), (primary pul...","[(congestive heart failure, 3.5), (primary pul..."
0,delapril,indication,"[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[congestive heart failure, Asymptomatic left v...",probCBR,0.242424,"[Hypertensive crisis, kidney disease, diabetic...","[(congestive heart failure, 1), (open-angle gl...","[(congestive heart failure, 3.5), (primary pul...","[(congestive heart failure, 3.5), (primary pul..."
538,clevidipine,indication,"[CHEBI:135738, DOID:10763, DOID:9654, WD:Q2530...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[obsolete hypertension complicating pregnancy,...",TransE,0.222222,"[Timothy syndrome, Asymptomatic left ventricul...",[(obsolete hypertension complicating pregnancy...,"[(primary hyperaldosteronism, 9.5), (open-angl...","[(primary hyperaldosteronism, 9.5), (open-angl..."
1,clevidipine,indication,"[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[DOID:10824, DOID:10825, DOID:6432, DOID:11130...","[primary hyperaldosteronism, primary open angl...",probCBR,0.222222,"[Timothy syndrome, Asymptomatic left ventricul...","[(primary hyperaldosteronism, 1), (open-angle ...","[(primary hyperaldosteronism, 9.5), (open-angl...","[(primary hyperaldosteronism, 9.5), (open-angl..."
539,valrubicin,indication,"[CHEBI:135876, DOID:11813, DOID:5432, DOID:118...","[DOID:11818, DOID:11821, DOID:5427, DOID:11593...","[bladder trigone cancer, bladder papillary tra...",TransE,0.171717,"[myelodysplastic syndrome, ovarian cancer, acu...","[(familiar ovarian carcinoma, 8), (acute monoc...","[(B-cell lymphoma, 24), (prostate lymphoma, 27...",[]
...,...,...,...,...,...,...,...,...,...,...,...
378,gamma-Hydroxybutyrate sodium,indication,[DOID:8986],[DOID:8986],"[rheumatoid arthritis, anemia, epilepsy, migra...",probCBR,0.141414,"[panic disorder, dysplasia, generalized anxiet...","[(epilepsy, 3), (fibromyalgia, 14), (temporal ...","[(epilepsy, 9.5), (fibromyalgia, 10.5), (front...","[(epilepsy, 9.5), (fibromyalgia, 10.5), (front..."
1061,Ertugliflozin,indication,"[IKEY:YHIUPZFKHZTLSH-LXYIGGQGSA-N, CHEBI:13472...","[DOID:1837, DOID:9352, DOID:10182, DOID:11712]","[luseogliflozin, hypoglycemic coma, type 1 dia...",TransE,0.000000,[],[],[],[]
379,Ertugliflozin,indication,"[DOID:1837, DOID:9352, DOID:10182, DOID:11712]","[DOID:1837, DOID:9352, DOID:10182, DOID:11712]","[Type II diabetes mellitus, inflammation, Seps...",probCBR,0.000000,[],[],[],[]
1064,Ferric Carboxymaltose,indication,"[UNII:6897GXD6OE, DOID:11758, DOID:583, DOID:1...","[DOID:13121, DOID:11758]","[hemolytic anemia, obsolete congenital anemia,...",TransE,0.121212,"[hemolytic anemia, normocytic anemia, anemia, ...","[(hemolytic anemia, 1), (pancytopenia, 3), (an...","[(hemolytic anemia, 2), (anemia, 4), (pancytop...","[(hemolytic anemia, 2), (anemia, 4), (pancytop..."


In [60]:
# There are 556 rows (TransE and pCBR combined) that have at least one overlap with a mean overlap rank of 20 or less
trans_pcbr_filt_human20[
    trans_pcbr_filt_human20["filt_mean_overlap_rank"].apply(lambda x: len(x)) > 0
].shape

(556, 11)

In [61]:
# filtered for the top 20 mean overlap rank and exploded it. Got 921 potential hits

trans_pcbr_filt_human20[
    trans_pcbr_filt_human20["filt_mean_overlap_rank"].apply(lambda x: len(x)) > 0
][["h", "r", "algo", "overlap", "overlap_rank", "filt_mean_overlap_rank"]].explode(
    "filt_mean_overlap_rank"
).sort_values(
    ["h", "algo"]
)

Unnamed: 0,h,r,algo,overlap,overlap_rank,filt_mean_overlap_rank
1040,1-[(4-chlorophenyl)-phenylmethyl]-4-methylpipe...,indication,TransE,0.090909,"[(perennial allergic rhinitis, 2), (motion sic...","(perennial allergic rhinitis, 16.5)"
361,1-[(4-chlorophenyl)-phenylmethyl]-4-methylpipe...,indication,probCBR,0.090909,"[(heart disease, 11), (perennial allergic rhin...","(perennial allergic rhinitis, 16.5)"
1042,"2-(1,3-dimethyl-2,6-dioxo-7-purinyl)acetic acid",indication,TransE,0.101010,"[(pulmonary emphysema, 4), (bronchiectasis, 18...","(pulmonary emphysema, 2.5)"
363,"2-(1,3-dimethyl-2,6-dioxo-7-purinyl)acetic acid",indication,probCBR,0.101010,"[(pulmonary emphysema, 1), (Rheumatoid arthrit...","(pulmonary emphysema, 2.5)"
988,"3-\{2-[4-(6-fluoro-1,2-benzoxazol-3-yl)piperid...",indication,TransE,0.111111,"[(generalized anxiety disorder, 1), (bipolar d...","(bipolar disorder, 5)"
...,...,...,...,...,...,...
300,zofenopril,indication,probCBR,0.181818,"[(congestive heart failure, 1), (Asymptomatic ...","(inferolateral myocardial infarct, 16.5)"
300,zofenopril,indication,probCBR,0.181818,"[(congestive heart failure, 1), (Asymptomatic ...","(posterior myocardial infarction, 16.5)"
300,zofenopril,indication,probCBR,0.181818,"[(congestive heart failure, 1), (Asymptomatic ...","(lateral myocardial infarction, 19)"
742,zoledronic acid,indication,TransE,0.080808,"[(megakaryocytic leukemia, 27), (Chronic myelo...","(Chronic myeloid leukemia, 18)"


In [62]:
# Unique compound heads with at least one mean overlap rank of 20 or less
len(set(trans_pcbr_filt_human20[
    trans_pcbr_filt_human20["filt_mean_overlap_rank"].apply(lambda x: len(x)) > 0
][["h", "r", "algo", "overlap", "overlap_rank", "filt_mean_overlap_rank"]].explode(
    "filt_mean_overlap_rank"
).sort_values(
    ["h", "algo"]
).h))

278

In [63]:
# Unique compound heads after less than 20 rank and greater than or equal to 25% overlap
len(set(trans_pcbr_filt_human20[
    trans_pcbr_filt_human20["filt_mean_overlap_rank"].apply(lambda x: len(x)) > 0
][["h", "r", "algo", "overlap", "overlap_rank", "filt_mean_overlap_rank"]].explode(
    "filt_mean_overlap_rank"
).query(
    "overlap>=.25"
).h))

32

In [64]:
# Unique compound heads after less than 20 rank and greater than 25% overlap
len(set(trans_pcbr_filt_human20[
    trans_pcbr_filt_human20["filt_mean_overlap_rank"].apply(lambda x: len(x)) > 0
][["h", "r", "algo", "overlap", "overlap_rank", "filt_mean_overlap_rank"]].explode(
    "filt_mean_overlap_rank"
).query(
    "overlap>.25"
).h))

32

In [65]:
# Which are the Unique compound heads after less than 20 rank and greater than or equal to 25% overlap
set(trans_pcbr_filt_human20[
    trans_pcbr_filt_human20["filt_mean_overlap_rank"].apply(lambda x: len(x)) > 0
][["h", "r", "algo", "overlap", "overlap_rank", "filt_mean_overlap_rank"]].explode(
    "filt_mean_overlap_rank"
).query(
    "overlap>=.25"
).h)

{'BOL-303259-X',
 'Enasidenib',
 'Fluvoxamine maleate',
 'Idarubicin hydrochloride',
 'Medrysone',
 'Thiothixene',
 'actinomycin D',
 'alfacalcidol',
 'almotriptan',
 'bitolterol',
 'carbamazepine',
 'carvedilol',
 'desonide',
 'econazole',
 'everolimus',
 'imatinib methanesulfonate',
 'imiquimod',
 'leuprolide',
 'lisdexamfetamine',
 'lurasidone hydrochloride',
 'naftifine',
 'orlistat',
 'phentermine',
 'potassium citrate (anhydrous)',
 'streptomycin',
 'temozolomide',
 'teniposide',
 'teriflunomide',
 'tioguanine',
 'tolnaftate',
 'triamcinolone',
 'vinorelbine'}

In [66]:
set(trans_pcbr_filt_human20[
    trans_pcbr_filt_human20["filt_mean_overlap_rank"].apply(lambda x: len(x)) > 0
][["h", "r", "algo", "overlap", "overlap_rank", "filt_mean_overlap_rank"]].explode(
    "filt_mean_overlap_rank"
).query(
    "overlap>=.25"
).h).intersection(search)

{'Enasidenib', 'econazole', 'imatinib methanesulfonate'}

In [67]:
trans_pcbr_filt_human20[
    trans_pcbr_filt_human20["filt_mean_overlap_rank"].apply(lambda x: len(x)) > 0
][["h", "r", "algo", "overlap", "overlap_rank", "filt_mean_overlap_rank"]].explode(
    "filt_mean_overlap_rank"
).query(
    "overlap>=.25"
)

Unnamed: 0,h,r,algo,overlap,overlap_rank,filt_mean_overlap_rank
541,lisdexamfetamine,indication,TransE,0.272727,"[(narcolepsy, 1), (cognitive disorder, 4), (ma...","(major depressive disorder, 3.5)"
541,lisdexamfetamine,indication,TransE,0.272727,"[(narcolepsy, 1), (cognitive disorder, 4), (ma...","(autistic disorder, 5.5)"
541,lisdexamfetamine,indication,TransE,0.272727,"[(narcolepsy, 1), (cognitive disorder, 4), (ma...","(obesity, 7.5)"
541,lisdexamfetamine,indication,TransE,0.272727,"[(narcolepsy, 1), (cognitive disorder, 4), (ma...","(Parkinson's disease, 12)"
541,lisdexamfetamine,indication,TransE,0.272727,"[(narcolepsy, 1), (cognitive disorder, 4), (ma...","(paranoid schizophrenia, 16.5)"
...,...,...,...,...,...,...
376,BOL-303259-X,indication,probCBR,0.353535,"[(glaucoma, 3), (angle-closure glaucoma, 4), (...","(neovascular glaucoma, 9)"
376,BOL-303259-X,indication,probCBR,0.353535,"[(glaucoma, 3), (angle-closure glaucoma, 4), (...","(secondary hypertension, 14.5)"
376,BOL-303259-X,indication,probCBR,0.353535,"[(glaucoma, 3), (angle-closure glaucoma, 4), (...","(pulmonary hypertension, 20)"
1059,Enasidenib,indication,TransE,0.313131,"[(Acute erythroid leukemia, 11), (mast-cell le...","(leukemia, 16)"


In [68]:
trans_pcbr_filt_human20[
    trans_pcbr_filt_human20["filt_mean_overlap_rank"].apply(lambda x: len(x)) > 0
][["h", "r", "algo", "overlap", "overlap_rank", "filt_mean_overlap_rank"]].explode(
    "filt_mean_overlap_rank"
).query(
    "overlap>.25"
)

Unnamed: 0,h,r,algo,overlap,overlap_rank,filt_mean_overlap_rank
541,lisdexamfetamine,indication,TransE,0.272727,"[(narcolepsy, 1), (cognitive disorder, 4), (ma...","(major depressive disorder, 3.5)"
541,lisdexamfetamine,indication,TransE,0.272727,"[(narcolepsy, 1), (cognitive disorder, 4), (ma...","(autistic disorder, 5.5)"
541,lisdexamfetamine,indication,TransE,0.272727,"[(narcolepsy, 1), (cognitive disorder, 4), (ma...","(obesity, 7.5)"
541,lisdexamfetamine,indication,TransE,0.272727,"[(narcolepsy, 1), (cognitive disorder, 4), (ma...","(Parkinson's disease, 12)"
541,lisdexamfetamine,indication,TransE,0.272727,"[(narcolepsy, 1), (cognitive disorder, 4), (ma...","(paranoid schizophrenia, 16.5)"
...,...,...,...,...,...,...
376,BOL-303259-X,indication,probCBR,0.353535,"[(glaucoma, 3), (angle-closure glaucoma, 4), (...","(neovascular glaucoma, 9)"
376,BOL-303259-X,indication,probCBR,0.353535,"[(glaucoma, 3), (angle-closure glaucoma, 4), (...","(secondary hypertension, 14.5)"
376,BOL-303259-X,indication,probCBR,0.353535,"[(glaucoma, 3), (angle-closure glaucoma, 4), (...","(pulmonary hypertension, 20)"
1059,Enasidenib,indication,TransE,0.313131,"[(Acute erythroid leukemia, 11), (mast-cell le...","(leukemia, 16)"


In [69]:
# If we threshold on 'overlap percentage' greater than or equal to 0.25, we get 312, which is only 57 more than the amount we got previously (99 vs 156).
# Remember to divide the row count by 2, since each entry appears once for TransE and once for probCBR.
trans_pcbr_filt_human20[
    trans_pcbr_filt_human20["filt_mean_overlap_rank"].apply(lambda x: len(x)) > 0
][["h", "r", "algo", "overlap", "overlap_rank", "filt_mean_overlap_rank"]].explode(
    "filt_mean_overlap_rank"
).query(
    "overlap>=.25"
).to_csv(
    "./TransE_pCBR_Top_Overlaps.tsv", sep="\t", header=True, index=False
)

## What about top 5?

In [70]:
# 136 unique indications
# Filter a copy of the frame so that the "filt_mean_overlap_rank" column held
# by trans_pcbr_filt_human20 (top 20) cannot be overwritten by this top-5 call.
trans_pcbr_filt_human5 = filter_mean_overlap_rank(trans_pcbr_filt_human.copy(), 5)

trans_pcbr_filt_human5[
    trans_pcbr_filt_human5["filt_mean_overlap_rank"].apply(lambda x: len(x)) > 0
][["h", "r", "algo", "filt_preds","overlap", "overlap_rank", "filt_mean_overlap_rank"]].explode(
    "filt_mean_overlap_rank"
).sort_values(
    ["h", "algo"]
)

Unnamed: 0,h,r,algo,filt_preds,overlap,overlap_rank,filt_mean_overlap_rank
1042,"2-(1,3-dimethyl-2,6-dioxo-7-purinyl)acetic acid",indication,TransE,"[bronchus carcinoma in situ, bronchial mucus g...",0.101010,"[(pulmonary emphysema, 4), (bronchiectasis, 18...","(pulmonary emphysema, 2.5)"
363,"2-(1,3-dimethyl-2,6-dioxo-7-purinyl)acetic acid",indication,probCBR,"[pulmonary emphysema, atopic dermatitis, AR, C...",0.101010,"[(pulmonary emphysema, 1), (Rheumatoid arthrit...","(pulmonary emphysema, 2.5)"
988,"3-\{2-[4-(6-fluoro-1,2-benzoxazol-3-yl)piperid...",indication,TransE,"[generalized anxiety disorder, separation anxi...",0.111111,"[(generalized anxiety disorder, 1), (bipolar d...","(bipolar disorder, 5)"
322,"3-\{2-[4-(6-fluoro-1,2-benzoxazol-3-yl)piperid...",indication,probCBR,"[autistic disorder, major depressive disorder,...",0.111111,"[(autistic disorder, 1), (major depressive dis...","(bipolar disorder, 5)"
655,4-aminopyridine,indication,TransE,"[status epilepticus, neurodegenerative disease...",0.202020,"[(status epilepticus, 1), (chronic lymphocytic...","(status epilepticus, 2)"
...,...,...,...,...,...,...,...
218,topiramate,indication,probCBR,"[bipolar disorder, childhood absence epilepsy,...",0.212121,"[(bipolar disorder, 1), (childhood absence epi...","(childhood absence epilepsy, 2)"
218,topiramate,indication,probCBR,"[bipolar disorder, childhood absence epilepsy,...",0.212121,"[(bipolar disorder, 1), (childhood absence epi...","(cyclothymic disorder, 4)"
218,topiramate,indication,probCBR,"[bipolar disorder, childhood absence epilepsy,...",0.212121,"[(bipolar disorder, 1), (childhood absence epi...","(bipolar disorder, 4)"
549,trandolaprilat,indication,TransE,"[congestive heart failure, diabetic autonomic ...",0.101010,"[(congestive heart failure, 1), (hypertrophic ...","(congestive heart failure, 1)"


In [81]:
# rows that still have at least one entry after the top-5 filter
has_top5_hit = trans_pcbr_filt_human5["filt_mean_overlap_rank"].apply(len) > 0
top5_columns = [
    "h",
    "r",
    "algo",
    "filt_preds",
    "overlap",
    "overlap_rank",
    "filt_mean_overlap_rank",
]

# one row per (indication, mean rank) tuple, ordered by compound then algorithm
human_5_split_rank = (
    trans_pcbr_filt_human5.loc[has_top5_hit, top5_columns]
    .explode("filt_mean_overlap_rank")
    .sort_values(["h", "algo"])
)

# split each (indication, mean rank) tuple into two plain columns
human_5_split_rank["indication"] = human_5_split_rank["filt_mean_overlap_rank"].apply(
    lambda pair: pair[0]
)
human_5_split_rank["mean_overlap_rank"] = human_5_split_rank[
    "filt_mean_overlap_rank"
].apply(lambda pair: pair[1])

human_5_split_rank.head()

Unnamed: 0,h,r,algo,filt_preds,overlap,overlap_rank,filt_mean_overlap_rank,indication,mean_overlap_rank
1042,"2-(1,3-dimethyl-2,6-dioxo-7-purinyl)acetic acid",indication,TransE,"[bronchus carcinoma in situ, bronchial mucus g...",0.10101,"[(pulmonary emphysema, 4), (bronchiectasis, 18...","(pulmonary emphysema, 2.5)",pulmonary emphysema,2.5
363,"2-(1,3-dimethyl-2,6-dioxo-7-purinyl)acetic acid",indication,probCBR,"[pulmonary emphysema, atopic dermatitis, AR, C...",0.10101,"[(pulmonary emphysema, 1), (Rheumatoid arthrit...","(pulmonary emphysema, 2.5)",pulmonary emphysema,2.5
988,"3-\{2-[4-(6-fluoro-1,2-benzoxazol-3-yl)piperid...",indication,TransE,"[generalized anxiety disorder, separation anxi...",0.111111,"[(generalized anxiety disorder, 1), (bipolar d...","(bipolar disorder, 5)",bipolar disorder,5.0
322,"3-\{2-[4-(6-fluoro-1,2-benzoxazol-3-yl)piperid...",indication,probCBR,"[autistic disorder, major depressive disorder,...",0.111111,"[(autistic disorder, 1), (major depressive dis...","(bipolar disorder, 5)",bipolar disorder,5.0
655,4-aminopyridine,indication,TransE,"[status epilepticus, neurodegenerative disease...",0.20202,"[(status epilepticus, 1), (chronic lymphocytic...","(status epilepticus, 2)",status epilepticus,2.0


In [82]:
human_5_split_rank = human_5_split_rank.drop(columns = ['filt_mean_overlap_rank'])
human_5_split_rank.sort_values(by = ['mean_overlap_rank','h'], ascending = True)

Unnamed: 0,h,r,algo,filt_preds,overlap,overlap_rank,indication,mean_overlap_rank
875,Meclizine,indication,TransE,"[space motion sickness, obsolete diaper rash, ...",0.121212,"[(space motion sickness, 1), (allergic rhiniti...",space motion sickness,1.0
240,Meclizine,indication,probCBR,"[space motion sickness, angioedema, allergic r...",0.121212,"[(space motion sickness, 1), (allergic rhiniti...",space motion sickness,1.0
636,Milnacipran hydrochloride,indication,TransE,"[major depressive disorder, anaclitic depressi...",0.141414,"[(major depressive disorder, 1), (cerebrovascu...",major depressive disorder,1.0
73,Milnacipran hydrochloride,indication,probCBR,"[major depressive disorder, asthma, rheumatoid...",0.141414,"[(major depressive disorder, 1), (rheumatoid a...",major depressive disorder,1.0
740,dipivefrin hydrochloride,indication,TransE,"[ocular hypertension, obsolete glaucoma associ...",0.111111,"[(ocular hypertension, 1), (hypertension, 19),...",ocular hypertension,1.0
...,...,...,...,...,...,...,...,...
268,teduglutide,indication,probCBR,"[Crohn's disease, rheumatoid arthritis, ileiti...",0.141414,"[(Crohn's disease, 1), (rheumatoid arthritis, ...",ileitis,5.0
887,teriflunomide,indication,TransE,"[multiple endocrine neoplasia type 4, chronic ...",0.262626,"[(chronic lymphocytic leukemia, 2), (megakaryo...",hairy cell leukemia,5.0
251,teriflunomide,indication,probCBR,"[megakaryocytic leukemia, hairy cell leukemia,...",0.262626,"[(megakaryocytic leukemia, 1), (hairy cell leu...",hairy cell leukemia,5.0
1048,tolnaftate,indication,TransE,"[cutaneous candidiasis, obsolete tinea, vulvov...",0.454545,"[(cutaneous candidiasis, 1), (obsolete tinea, ...",Onychomycosis,5.0


In [83]:
# Write the table, ordered by `mean_overlap_rank` then `h`, to a
# tab-separated file with a header row and without the index.
(
    human_5_split_rank
    .sort_values(by=["mean_overlap_rank", "h"], ascending=True)
    .to_csv(
        "./top_5_mean_overlap_rank.tsv",
        sep="\t",
        header=True,
        index=False,
    )
)

In [84]:
# Rows whose `mean_overlap_rank` equals 1, ordered by rank then `h`.
(
    human_5_split_rank
    .sort_values(by=["mean_overlap_rank", "h"], ascending=True)
    .query("mean_overlap_rank==1")
)

Unnamed: 0,h,r,algo,filt_preds,overlap,overlap_rank,indication,mean_overlap_rank
875,Meclizine,indication,TransE,"[space motion sickness, obsolete diaper rash, ...",0.121212,"[(space motion sickness, 1), (allergic rhiniti...",space motion sickness,1.0
240,Meclizine,indication,probCBR,"[space motion sickness, angioedema, allergic r...",0.121212,"[(space motion sickness, 1), (allergic rhiniti...",space motion sickness,1.0
636,Milnacipran hydrochloride,indication,TransE,"[major depressive disorder, anaclitic depressi...",0.141414,"[(major depressive disorder, 1), (cerebrovascu...",major depressive disorder,1.0
73,Milnacipran hydrochloride,indication,probCBR,"[major depressive disorder, asthma, rheumatoid...",0.141414,"[(major depressive disorder, 1), (rheumatoid a...",major depressive disorder,1.0
740,dipivefrin hydrochloride,indication,TransE,"[ocular hypertension, obsolete glaucoma associ...",0.111111,"[(ocular hypertension, 1), (hypertension, 19),...",ocular hypertension,1.0
137,dipivefrin hydrochloride,indication,probCBR,"[ocular hypertension, hypertension, epilepsy, ...",0.111111,"[(ocular hypertension, 1), (hypertension, 2), ...",ocular hypertension,1.0
854,lansoprazole,indication,TransE,"[peptic esophagitis, obsolete dyspepsia, Patho...",0.090909,"[(peptic esophagitis, 1), (obsolete dyspepsia,...",peptic esophagitis,1.0
223,lansoprazole,indication,probCBR,"[peptic esophagitis, obsolete dyspepsia, dysch...",0.090909,"[(peptic esophagitis, 1), (obsolete dyspepsia,...",peptic esophagitis,1.0
868,lorcaserin,indication,TransE,"[childhood obesity, Wilson-Turner syndrome, sc...",0.070707,"[(childhood obesity, 1), (Wilson-Turner syndro...",childhood obesity,1.0
234,lorcaserin,indication,probCBR,"[childhood obesity, Wilson-Turner syndrome, me...",0.070707,"[(childhood obesity, 1), (Wilson-Turner syndro...",childhood obesity,1.0


In [86]:
# Rows whose `mean_overlap_rank` is greater than 1.5, ordered by rank then `h`.
(
    human_5_split_rank
    .sort_values(by=["mean_overlap_rank", "h"], ascending=True)
    .query("mean_overlap_rank>1.5")
)

Unnamed: 0,h,r,algo,filt_preds,overlap,overlap_rank,indication,mean_overlap_rank
655,4-aminopyridine,indication,TransE,"[status epilepticus, neurodegenerative disease...",0.202020,"[(status epilepticus, 1), (chronic lymphocytic...",status epilepticus,2.0
88,4-aminopyridine,indication,probCBR,"[major depressive disorder, acute promyelocyti...",0.202020,"[(major depressive disorder, 1), (status epile...",status epilepticus,2.0
938,"5-methoxy-2-\{[(4-methoxy-3,5-dimethylpyridin-...",indication,TransE,"[obsolete dyspepsia, L-ethionine, granulomatou...",0.090909,"[(obsolete dyspepsia, 1), (cystic fibrosis, 14...",obsolete dyspepsia,2.0
290,"5-methoxy-2-\{[(4-methoxy-3,5-dimethylpyridin-...",indication,probCBR,"[shigellosis, gastritis, obsolete dyspepsia, s...",0.090909,"[(gastritis, 2), (obsolete dyspepsia, 3), (per...",obsolete dyspepsia,2.0
1064,Ferric Carboxymaltose,indication,TransE,"[hemolytic anemia, obsolete congenital anemia,...",0.121212,"[(hemolytic anemia, 1), (pancytopenia, 3), (an...",hemolytic anemia,2.0
...,...,...,...,...,...,...,...,...
268,teduglutide,indication,probCBR,"[Crohn's disease, rheumatoid arthritis, ileiti...",0.141414,"[(Crohn's disease, 1), (rheumatoid arthritis, ...",ileitis,5.0
887,teriflunomide,indication,TransE,"[multiple endocrine neoplasia type 4, chronic ...",0.262626,"[(chronic lymphocytic leukemia, 2), (megakaryo...",hairy cell leukemia,5.0
251,teriflunomide,indication,probCBR,"[megakaryocytic leukemia, hairy cell leukemia,...",0.262626,"[(megakaryocytic leukemia, 1), (hairy cell leu...",hairy cell leukemia,5.0
1048,tolnaftate,indication,TransE,"[cutaneous candidiasis, obsolete tinea, vulvov...",0.454545,"[(cutaneous candidiasis, 1), (obsolete tinea, ...",Onychomycosis,5.0


In [71]:
# Number of unique compounds (`h`) that have at least one entry in
# `filt_mean_overlap_rank`.
# Only the row filter matters for a distinct count: projecting columns,
# exploding the list column and sorting cannot add or remove `h` values, so
# those steps are omitted.
len(
    set(
        trans_pcbr_filt_human5.loc[
            trans_pcbr_filt_human5["filt_mean_overlap_rank"].apply(len) > 0,
            "h",
        ]
    )
)

116

## Get medians

In [65]:
# Median `overlap` in `trans_pcbr_filt_human20`, computed over rows that
# have a non-empty `filt_mean_overlap_rank` list and an overlap above 0.25.
# The list column is exploded first, so each row is repeated once per list
# element and contributes that many times to the median.
(
    trans_pcbr_filt_human20.loc[
        trans_pcbr_filt_human20["filt_mean_overlap_rank"].apply(len) > 0,
        ["h", "r", "algo", "overlap", "overlap_rank", "filt_mean_overlap_rank"],
    ]
    .explode("filt_mean_overlap_rank")
    .query("overlap>.25")["overlap"]
    .median()
)

0.3