In [1]:
import os
import re
import statistics
import pickle
import itertools
import json
from collections import defaultdict

import scipy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import upsetplot
from upsetplot import UpSet, from_contents
from tqdm import tqdm

import score_utils

# Get the intersection of top non-indication predictions
* Put all algorithms' predictions together so we can curate the results

## Import Data

### Import pCBR

In [2]:
# Import probCBR res
def json_to_df(json_dir: str) -> pd.DataFrame:
    """
    Load a JSON file containing a list of records and return it as a DataFrame.

    The column set is taken from the keys of the first record. Every record
    must provide all of those keys; keys that only appear in later records
    are ignored.

    Parameters
    ----------
    json_dir : str
        Path to the JSON file (a file path, despite the parameter name).

    Returns
    -------
    pd.DataFrame
        One row per record and one column per key of the first record.
        An empty JSON document yields an empty DataFrame.

    Raises
    ------
    KeyError
        If a record lacks one of the keys found in the first record.
    """
    # import the json object
    with open(json_dir, "r") as f:
        json_obj = json.load(f)

    # nothing to tabulate: avoid indexing into an empty list below
    if not json_obj:
        return pd.DataFrame()

    # columns are defined by the first json entry
    columns = list(json_obj[0].keys())
    return pd.DataFrame(
        {key: [record[key] for record in json_obj] for key in columns}
    )

In [3]:
# Load the probCBR predictions: one row per JSON record.
# NOTE(review): absolute, user-specific path — the notebook will not run on another machine as-is.
pcbr_df = json_to_df("/home/msinha/Open-BIo-Link/data1.json")
pcbr_df.head(2)

Unnamed: 0,e1,r,answers,predicted_answers
0,CHEBI:135735,indication,[DOID:10763],"[DOID:6432, DOID:6000, DOID:10763, DOID:10591,..."
1,DOID:10763,indication_inv,"[CHEBI:135735, CHEBI:135738, CHEBI:141521, CHE...","[CHEBI:31548, CHEBI:46632, CHEBI:5784, CHEBI:5..."


In [4]:
def process_dataframe(
    df: pd.DataFrame,
    data_dir: str,
    model_dir: str,
    mode: str = "tail-batch",
    rel=[
        "indication",
        "indication_CiD",
    ],
):
    """
    Attach the graph's known true targets to a frame of predictions.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns "e1", "r", "answers" and "predicted_answers".
    data_dir : str
        Passed to `score_utils.ProcessOutput` as `data_dir`.
    model_dir : str
        Directory containing "test_scores.tsv", passed as `scores_outfile`.
    mode : str
        Either "tail-batch" (the query entity "e1" is the head) or
        "head-batch" (the query entity "e1" is the tail).
    rel : list
        Relations kept from the true-target table. The default list is never
        mutated inside this function.

    Returns
    -------
    pd.DataFrame
        `df` left-merged with the true-target table. Rows whose
        "predicted_answers" is empty are dropped. The "true_t" column
        ("true_h" in head-batch mode) holds the de-duplicated union of the
        merged targets and the row's "answers".

    Raises
    ------
    ValueError
        If `mode` is neither "head-batch" nor "tail-batch".
    """
    # Validate first so that a bad mode fails fast, before any files are read.
    valid_modes = ("head-batch", "tail-batch")
    if mode not in valid_modes:
        raise ValueError(f"{mode} is not in {valid_modes}")

    # get true head/tail triples from graph
    raw = score_utils.ProcessOutput(
        data_dir=data_dir,
        scores_outfile=os.path.join(model_dir, "test_scores.tsv"),
        mode=mode,
    )
    graph = raw.get_true_targets()
    graph = graph.query("r in @rel")

    predictions = df[["e1", "r", "answers", "predicted_answers"]]

    if mode == "tail-batch":
        # get true tail triples from graph
        merged = pd.merge(
            left=predictions.rename(columns={"e1": "h"}),
            right=graph,
            how="left",
            on=["h", "r"],
        )
        merged = merged.rename(columns={"t": "true_t"})
        true_col = "true_t"
    else:
        # get true head triples from graph
        merged = pd.merge(
            left=predictions.rename(columns={"e1": "t"}),
            right=graph,
            how="left",
            on=["r", "t"],
        )
        # NOTE(review): this labels the query-entity column `t` as `true_h` and
        # the graph's `h` column as `t`, which looks swapped relative to the
        # tail-batch branch. Kept as-is; confirm before relying on head-batch output.
        merged = merged.rename(columns={"t": "true_h", "h": "t"})
        true_col = "true_h"

    # Remove rows with no predictions; copy so the assignment below does not
    # write into a slice of the merged frame.
    has_predictions = merged["predicted_answers"].apply(lambda x: len(x) > 0)
    merged = merged[has_predictions].copy()

    # Ensure the answers are part of the true targets. A float here is the NaN
    # left by the left-merge when the graph had no matching row.
    merged[true_col] = [
        list(set(answers))
        if isinstance(known, float)
        else list(set(known) | set(answers))
        for known, answers in zip(merged[true_col], merged["answers"])
    ]

    return merged


def flatten_list(a_ls: list) -> list:
    """
    Flatten every entry of `a_ls` by one level and drop duplicates.

    Despite the name, the result is still a list of lists: one output list
    per entry of `a_ls`. Within an entry, elements that are themselves lists
    are expanded in place and all other elements are kept as they are.
    Duplicates are removed while preserving first-seen order, so the result
    is reproducible between runs.

    Parameters
    ----------
    a_ls : list
        Iterable of iterables; inner elements may be scalars or lists.

    Returns
    -------
    list
        List of de-duplicated, one-level-flattened lists.
    """
    return_ls = []
    for entry in a_ls:
        members = []
        for item in entry:
            if isinstance(item, list):
                members.extend(item)
            else:
                members.append(item)
        # dict.fromkeys de-duplicates and keeps insertion order (unlike set)
        return_ls.append(list(dict.fromkeys(members)))

    return return_ls


# ensure answers are in true_t


def add_list_to_list(ls1: list, ls2: list) -> list:
    """
    Merge each list of `ls1` into the list at the same position in `ls2`.

    ls1: lists whose elements are added (read only)
    ls2: lists that receive the elements; modified in place and returned.
         Each original inner list is extended, then the slot is replaced by a
         de-duplicated list (element order is whatever `set` yields).
    """
    for position, extra in enumerate(ls1):
        # extend the existing inner list first, as callers may hold a reference to it
        ls2[position].extend(extra)
        ls2[position] = list(set(ls2[position]))

    return ls2

In [5]:
## Attach the graph's true tails to the pCBR queries (default tail-batch mode),
## then keep the forward "indication" queries only.
# note that true_t are all unique answers
pcbr_df = process_dataframe(
    df=pcbr_df,
    data_dir="../data/MIND_CtD/",
    model_dir="../models/TransE_MIND_CtD_megha",
).query('r=="indication"')

#### pCBR DataFrame

In [6]:
# Flatten true_t, make sure every answer is included in it, and tag the algorithm.
# `.assign` returns a new frame, so nothing is written into the `.query()` result
# produced by the previous cell (avoids chained-assignment warnings).
pcbr_df = pcbr_df.assign(
    true_t=lambda d: add_list_to_list(list(d["answers"]), flatten_list(d["true_t"])),
    algo="pCBR",
)

pcbr_df.head(2)

Unnamed: 0,h,r,answers,predicted_answers,true_t,algo
0,CHEBI:135735,indication,[DOID:10763],"[DOID:6432, DOID:6000, DOID:10763, DOID:10591,...","[DOID:10591, DOID:11130, DOID:10825, DOID:1076...",pCBR
2,CHEBI:135738,indication,[DOID:10763],"[DOID:10763, DOID:446, DOID:10591, DOID:10824,...","[DOID:10591, DOID:11130, DOID:10825, DOID:1076...",pCBR


### Import CBR

In [7]:
# NOTE(review): absolute, user-specific path — the notebook will not run on another machine as-is.
cbr_df = json_to_df("/home/msinha/CBR-AKBC/results/data_CBRonMIND_CtD.json")
cbr_df = process_dataframe(
    df=cbr_df, data_dir="../data/MIND_CtD", model_dir="../models/TransE_MIND_CtD_megha/"
)
# Ensure answers are in true_t and tag the algorithm. `.assign` builds a new
# frame instead of writing into the filtered frame returned by process_dataframe.
cbr_df = cbr_df.assign(
    true_t=lambda d: add_list_to_list(list(d["answers"]), flatten_list(list(d["true_t"]))),
    algo="CBR",
)
cbr_df.head(2)

Unnamed: 0,h,r,answers,predicted_answers,true_t,algo
0,CHEBI:135735,indication,[DOID:10763],"[DOID:3393, DOID:6000, DOID:10763, DOID:3683, ...","[DOID:10591, DOID:11130, DOID:10825, DOID:1076...",CBR
1,CHEBI:135738,indication,[DOID:10763],"[HP:0000006, HP:0000007, HP:0001425, HP:000142...","[DOID:10591, DOID:11130, DOID:10825, DOID:1076...",CBR


### Import Rephetio Results

In [8]:
# Known indication triples from the graph; merged in below as the `answers` column.
raw = score_utils.ProcessOutput(
    data_dir="../data/MIND_CtD/",
    scores_outfile=os.path.join("../models/TransE_MIND_CtD_megha/", "test_scores.tsv"),
    mode="tail-batch",
)
known_triples = raw.get_true_targets()
known_triples = known_triples[known_triples["r"].isin(["indication", "indication_CiD"])]

# Get unique chemical substances and group their rows together: sorting by
# descending proba first means each grouped list is ordered best-first.
rephetio = (
    pd.read_csv(
        "../../MechRepoNet/1_code/KG_reasoning_comparison/Rephetio_MIND_CtD_test/results.csv"
    )
    .sort_values(by="proba", ascending=False)
    .groupby("chemicalsubstance_id")
    .agg(lambda x: list(x))
    .reset_index()
    .rename(columns={"chemicalsubstance_id": "h", "disease_id": "predicted_answers"})
    .loc[:, ["h", "predicted_answers", "proba"]]
    .assign(algo="Rephetio")
)

# Merge known answers with rephetio to get a list of known answers per chemical compound. Just formatting.
rephetio = (
    rephetio.merge(known_triples, on="h", how="left")
    .rename(columns={"t": "answers"})
    .loc[:, ["h", "predicted_answers", "answers", "proba", "algo"]]
)

rephetio.head(2)

Unnamed: 0,h,predicted_answers,answers,proba,algo
0,CHEBI:135735,"[DOID:10763, DOID:5844, DOID:10825, DOID:2559,...","[DOID:10591, DOID:10824, DOID:10825, DOID:1113...","[0.6733899229161452, 0.1993276939787308, 0.091...",Rephetio
1,CHEBI:135738,"[DOID:10763, DOID:0060224, DOID:10825, DOID:58...","[DOID:10591, DOID:10824, DOID:10825, DOID:1113...","[0.9325203250946036, 0.780770655712133, 0.5917...",Rephetio


In [9]:
# Keep the first 100 predictions per compound (lists were built from rows sorted by descending proba).
# Note that the `proba` column is not truncated here.
rephetio["predicted_answers"] = rephetio["predicted_answers"].map(
    lambda ranked: ranked[:100]
)
# Summarise the list lengths after truncation.
rephetio["predicted_answers"].map(len).describe()

count    374.0
mean     100.0
std        0.0
min      100.0
25%      100.0
50%      100.0
75%      100.0
max      100.0
Name: predicted_answers, dtype: float64

### Import KGE Results

In [10]:
# Collect the top-100 tail predictions of every KGE model, one frame per model.
kge_ls = []
for model_name in ["TransE", "DistMult", "ComplEx", "RotatE"]:
    print(model_name)
    raw = score_utils.ProcessOutput(
        data_dir="../data/MIND_CtD",
        scores_outfile=f"../models/{model_name}_MIND_CtD_megha/test_scores.tsv",
        mode="tail-batch",
    )
    raw.format_raw_scores_to_df()
    raw.translate_embeddings(direction="from")

    # `.assign` creates a new frame rather than writing into the `.query()`
    # result, which would be a chained assignment.
    top_preds = (
        raw.filter_predictions(top=100)
        .query('batch=="tail-batch"')
        .assign(
            # limit to top 100
            preds=lambda d: d["preds"].apply(lambda x: x[0:100]),
            algo=model_name,
        )
    )
    kge_ls.append(top_preds)
    print(f"Shape: {top_preds.shape}")

TransE
Shape: (537, 7)
DistMult
Shape: (537, 7)
ComplEx
Shape: (537, 7)
RotatE
Shape: (537, 7)


In [11]:
# Stack the per-model frames; the `algo` column identifies the source model.
# The original row labels are kept (no ignore_index), so labels may repeat across models.
kge_df = pd.concat(kge_ls)
kge_df.head(2)

Unnamed: 0,h,r,preds,batch,true_t,filt_preds,algo
537,CHEBI:135735,indication,"[CHEBI:135735, DOID:9654, UMLS:C0221155, WD:Q2...",tail-batch,"[DOID:10591, DOID:10824, DOID:10825, DOID:1113...","[DOID:9654, UMLS:C0221155, WD:Q25303605, MONDO...",TransE
538,CHEBI:135738,indication,"[CHEBI:135738, DOID:10763, DOID:9654, WD:Q2530...",tail-batch,"[DOID:10591, DOID:10824, DOID:10825, DOID:1113...","[DOID:9654, WD:Q25303605, UMLS:C0221155, CHEBI...",TransE


In [12]:
# Drop duplicate compounds: keep one row per (h, r, algo); drop_duplicates retains the first occurrence.
# NOTE(review): assumes all predictions are the same given compound and model — not verified in this notebook.
print(f"Before duplicate removal: {kge_df.shape}")
kge_df = kge_df.drop_duplicates(subset=["h", "r", "algo"])
print(f"After: {kge_df.shape}")

Before duplicate removal: (2148, 7)
After: (1548, 7)


## Unify headers and clean data

In [13]:
# CBR and Rephetio don't yield results for some of the compound-disease predictions,
# so count the compounds covered by every method (only the TransE rows of kge_df are used).
len(
    set(pcbr_df["h"])
    & set(cbr_df["h"])
    & set(rephetio["h"])
    & set(kge_df.query('algo=="TransE"')["h"])
)

322

In [14]:
# Compounds present in all four sources (only the TransE rows of kge_df are used).
intersection_ls = list(
    set(pcbr_df["h"])
    & set(cbr_df["h"])
    & set(rephetio["h"])
    & set(kge_df.query('algo=="TransE"')["h"])
)

# Restrict CBR to the shared compounds, then emit one row per known answer.
cbr_df = cbr_df.query("h in @intersection_ls").explode(column="answers")
cbr_df.shape

(460, 6)

In [15]:
# Restrict pCBR to the shared compounds, then emit one row per known answer.
pcbr_df = pcbr_df.query("h in @intersection_ls").explode(column="answers")
pcbr_df.shape

(460, 6)

In [16]:
# Reorganize the Rephetio frame to the shared column layout. The right merge on
# `h` takes r / answers / true_t from the exploded CBR frame, giving one row per
# CBR row.
rephetio_df = (
    rephetio[["h", "predicted_answers", "algo"]]
    .merge(cbr_df[["h", "r", "answers", "true_t"]], on="h", how="right")
    .loc[:, ["h", "r", "answers", "predicted_answers", "true_t", "algo"]]
)

rephetio_df.head(1)

Unnamed: 0,h,r,answers,predicted_answers,true_t,algo
0,CHEBI:135735,indication,DOID:10763,"[DOID:10763, DOID:5844, DOID:10825, DOID:2559,...","[DOID:10591, DOID:11130, DOID:10825, DOID:1076...",Rephetio


In [17]:
# Sanity check: after the right merge on `h`, the row count should match the exploded CBR frame.
rephetio_df.shape

(460, 6)

In [18]:
# Re-group the exploded CBR answers: one row per compound, indexed by `h`,
# with `answers` collected back into a list.
cbr_df_groupby = cbr_df.groupby("h")[["answers"]].agg(list)

In [19]:
# Reorganize the KGE frame: attach each compound's CBR answers, keep the shared
# compounds, select the common columns, and emit one row per answer.
# NOTE(review): this cell overwrites kge_df, so re-running it will presumably
# fail once `answers` already exists — restart and run all instead.
kge_df = (
    kge_df.merge(cbr_df_groupby, on="h", how="left")
    .query("h in @intersection_ls")
    .loc[:, ["h", "r", "answers", "preds", "true_t", "algo"]]
    .explode("answers")
)

In [20]:
# Combine all dataframes together. The KGE frame's `preds` column is renamed so
# that every source shares the same `predicted_answers` column. Row labels are
# not reset here, so they repeat across sources.
combined_df = pd.concat(
    [
        cbr_df,
        pcbr_df,
        rephetio_df,
        kge_df.rename(columns={"preds": "predicted_answers"}),
    ]
)
combined_df.shape

(3220, 6)

In [21]:
# Get a consolidated list of Trues: for every compound `h`, the de-duplicated
# union of `true_t` over all of its rows (all algorithms). Element order within
# each list is arbitrary because it comes from set().
get_h2targets = (
    combined_df[["h", "true_t"]]
    .explode("true_t")
    .groupby("h")
    .agg(lambda x: list(set(x)))
    .reset_index()
)

In [22]:
# Map each compound to its consolidated true targets and store them per row as `true_t2`.
new_true_dict = dict(zip(get_h2targets["h"], get_h2targets["true_t"]))

combined_df["true_t2"] = [new_true_dict[compound] for compound in combined_df["h"]]
combined_df.shape

(3220, 7)

In [23]:
# Flag rows whose per-algorithm `true_t` is contained in the consolidated `true_t2`.
# Iterating both columns with zip avoids rebuilding the full list on every row.
# NOTE(review): despite the column name this is a subset test, not equality;
# since `true_t2` is built as the union of `true_t`, it is expected to be True everywhere.
combined_df["t_equal_t2"] = [
    set(per_algo).issubset(set(consolidated))
    for per_algo, consolidated in zip(combined_df["true_t"], combined_df["true_t2"])
]

In [24]:
# Rows whose `true_t` is not contained in the consolidated `true_t2`; an empty frame means the check passed.
combined_df.query("t_equal_t2==False")

Unnamed: 0,h,r,answers,predicted_answers,true_t,algo,true_t2,t_equal_t2


In [25]:
# Reset the indices after the concatenations and keep only the shared columns.
# NOTE(review): this drops `true_t2` (the consolidated trues) and `t_equal_t2`;
# downstream cells therefore use the per-algorithm `true_t` — confirm that is intended.
combined_df = combined_df.reset_index(drop=True)[
    ["h", "r", "answers", "predicted_answers", "true_t", "algo"]
]

In [26]:
# Preview the combined frame after the index reset.
combined_df.head(2)

Unnamed: 0,h,r,answers,predicted_answers,true_t,algo
0,CHEBI:135735,indication,DOID:10763,"[DOID:3393, DOID:6000, DOID:10763, DOID:3683, ...","[DOID:10591, DOID:11130, DOID:10825, DOID:1076...",CBR
1,CHEBI:135738,indication,DOID:10763,"[HP:0000006, HP:0000007, HP:0001425, HP:000142...","[DOID:10591, DOID:11130, DOID:10825, DOID:1076...",CBR


## Get dataset intersections

#### Import the list of compounds of interest

In [28]:
# Compounds of interest: the `compounds` column of the previously exported intersection file.
int_comp_list = list(
    pd.read_csv("./top_predictions_7_intersection.csv", sep=",")["compounds"]
)
len(int_comp_list)

25

#### separate trues from the predicted

In [37]:
def split_predict(predicted_answers: list, trues: list):
    """
    Split a ranked prediction list into known trues and remaining predictions.

    The rank of an entry is the 1-based position of its first occurrence in
    `predicted_answers`; the same rule is applied to trues and non-trues.

    Parameters
    ----------
    predicted_answers : list
        Predictions ordered from best to worst.
    trues : list
        Known true answers.

    Returns
    -------
    (dict, dict)
        true_dict: true -> rank, for trues that appear in the predictions
                   (keys follow the order of `trues`).
        pred_dict: prediction -> rank, for predictions that are not in
                   `trues` (keys follow rank order).
    """
    # One pass over the predictions; setdefault keeps the first (best) rank
    # when a prediction occurs more than once.
    first_rank = {}
    for position, pred in enumerate(predicted_answers, start=1):
        first_rank.setdefault(pred, position)

    # set membership instead of repeated list scans
    true_set = set(trues)

    true_dict = {true: first_rank[true] for true in trues if true in first_rank}
    pred_dict = {
        pred: rank for pred, rank in first_rank.items() if pred not in true_set
    }

    return true_dict, pred_dict

In [42]:
# Separate predictions into trues and non-trues together with their ranks.
# split_predict is evaluated once per row and both columns are taken from that
# single result (the columns are iterated directly instead of being rebuilt
# as lists for every row).
# NOTE(review): uses the per-algorithm `true_t`, not the consolidated trues — confirm.
rank_pairs = [
    split_predict(preds, trues)
    for preds, trues in zip(combined_df["predicted_answers"], combined_df["true_t"])
]
combined_df = combined_df.assign(
    true_ranks=[true_ranks for true_ranks, _ in rank_pairs],
    pred_ranks=[pred_ranks for _, pred_ranks in rank_pairs],
)
combined_df.head(2)

Unnamed: 0,h,r,answers,predicted_answers,true_t,algo,true_ranks,pred_ranks
0,CHEBI:135735,indication,DOID:10763,"[DOID:3393, DOID:6000, DOID:10763, DOID:3683, ...","[DOID:10591, DOID:11130, DOID:10825, DOID:1076...",CBR,"{'DOID:10591': 13, 'DOID:10763': 3, 'DOID:6432...","{'DOID:3393': 1, 'DOID:6000': 2, 'DOID:3683': ..."
1,CHEBI:135738,indication,DOID:10763,"[HP:0000006, HP:0000007, HP:0001425, HP:000142...","[DOID:10591, DOID:11130, DOID:10825, DOID:1076...",CBR,"{'DOID:10763': 8, 'DOID:6432': 99}","{'HP:0000006': 1, 'HP:0000007': 2, 'HP:0001425..."


In [44]:
# Subset of the combined frame: only the compounds of interest, and only the
# compound, its non-true prediction ranks, and the algorithm; re-indexed from 0.
combined_df2 = combined_df.loc[
    combined_df["h"].isin(int_comp_list), ["h", "pred_ranks", "algo"]
].reset_index(drop=True)
combined_df2.head(2)

Unnamed: 0,h,pred_ranks,algo
4,CHEBI:135925,"{'DOID:60080': 1, 'DOID:60075': 2, 'DOID:1612'...",CBR
15,CHEBI:17439,"{'MESH:D008106': 1, 'DOID:50671': 2, 'DOID:600...",CBR


In [58]:
# Top non-indication prediction per row: the key with the smallest rank in
# `pred_ranks`. Selecting by rank makes this independent of dict key order, and
# an empty dict (no non-true predictions) yields None instead of raising IndexError.
# NOTE(review): the KGE `preds` previewed earlier start with the query compound
# itself, so for KGE rows the top prediction may be the compound — confirm
# whether such entries should be filtered out first.
combined_df2["top_pred"] = combined_df2["pred_ranks"].apply(
    lambda ranks: min(ranks, key=ranks.get) if ranks else None
)
combined_df2[["h", "top_pred", "algo"]].head(2)

In [60]:
# Preview compound, top prediction and algorithm.
# NOTE(review): same preview as the last line of the previous cell; this cell can likely be removed.
combined_df2[["h", "top_pred", "algo"]].head(2)

Unnamed: 0,h,top_pred,algo
0,CHEBI:135925,DOID:60080,CBR
1,CHEBI:17439,MESH:D008106,CBR


#### Import the node ID-to-name dictionary

In [61]:
# Load the node table and build an id -> name lookup (for duplicate ids the last row wins).
# NOTE(review): the saved output shows a warning raised by this read_csv call; its text is
# truncated — possibly a mixed-dtype warning, confirm and set dtypes explicitly if so.
nodes = pd.read_csv("../../../MRN_dataset/nodes_biolink.csv")
nodes_dict = dict(zip(nodes.id, nodes.name))

  nodes = pd.read_csv("../../../MRN_dataset/nodes_biolink.csv")


#### Translate the dataframe and export it

In [65]:
# Translate compound and top-prediction ids to names for display.
# Indexing nodes_dict directly means an id missing from the node table raises KeyError.
combined_df2.assign(
    compound=lambda d: d["h"].map(lambda node_id: nodes_dict[node_id]),
    top_pred_dis=lambda d: d["top_pred"].map(lambda node_id: nodes_dict[node_id]),
).loc[:, ["h", "top_pred", "compound", "top_pred_dis", "algo"]]

Unnamed: 0,h,top_pred,compound,top_pred_dis,algo
0,CHEBI:135925,DOID:60080,lisdexamfetamine,"Advanced hormone receptor-positive, HER2­ nega...",CBR
1,CHEBI:17439,MESH:D008106,cyanocob(III)alamin,"Liver Cirrhosis, Experimental",CBR
2,CHEBI:31530,DOID:60164,edaravone,Pain relief,CBR
3,CHEBI:31854,DOID:9975,Milnacipran hydrochloride,cocaine dependence,CBR
4,CHEBI:3286,DOID:14330,cabergoline,Parkinson's disease,CBR
...,...,...,...,...,...
177,CHEBI:8708,DOID:6364,quetiapine fumarate,migraine,RotatE
178,CHEBI:9207,DOID:10763,sotalol hydrochloride,hypertension,RotatE
179,CHEBI:9711,DOID:0060001,triflupromazine,withdrawal disorder,RotatE
180,CHEBI:9725,DOID:9970,Trimeprazine,obesity,RotatE


In [64]:
# Translate compound and top-prediction ids to names and export the table as CSV
# (with header, without the index column).
combined_df2.assign(
    compound=lambda d: d["h"].map(lambda node_id: nodes_dict[node_id]),
    top_pred_dis=lambda d: d["top_pred"].map(lambda node_id: nodes_dict[node_id]),
).loc[:, ["h", "top_pred", "compound", "top_pred_dis", "algo"]].to_csv(
    "top_predicted_individual_algo_intersect.csv", header=True, index=False
)