# Create a minimal version of MIND (MechRepoNet with Indications)

In a prior notebook, we found that another author created minimal versions of the BioKG and OpenBioLink knowledge graphs by distilling them down to only three node types and up to three relations (depending on the network). These minimized knowledge graphs still had relatively good predictive capability while keeping the memory footprint low. The final sizes of the modified graphs were 1/10th and 1/80th of the original sizes of BioKG and OpenBioLink, respectively.

Let's create a minimal version of MIND following what was done above by keeping only Compound, Gene/Gene Product, and Disease nodes, and a reduced set of edge types: activates, inhibits, associated_with, marker_or_mechanism, positively_regulates, negatively_regulates, treats, and indication. After processing, 64,095 nodes and 1,101,475 edges remain. I created a minimal PyKEEN graph file, mini_MIND, to easily load the dataset in downstream knowledge graph embedding model training.

In [1]:
import polars as pl
import pandas as pd
import os

## Process the datasets

### load the data

In [None]:
# Load the node table; only its `id` and `label` columns are used later in this notebook.
# NOTE(review): absolute, user-specific path -- adjust before running on another machine.
nodes = pl.read_csv("/home/rogertu/MRN_dataset/nodes_biolink.csv", has_header=True)

In [6]:
# Peek at the node table schema.
nodes.head(2)

id,name,label,xrefs,source,synonyms,alt_ids,subsets
str,str,str,str,str,str,str,str
"""UBERON:0000002…","""cervix""","""AnatomicalEnti…","""MESH:D002584|U…",,,,
"""UBERON:0000004…","""human nose""","""AnatomicalEnti…","""MESH:D009666|U…",,,,


In [9]:
# Load the MIND triples from a headerless, tab-separated file and name the
# three columns head / rel / tail.
# NOTE(review): absolute, user-specific path -- adjust before running on another machine.
mind_triples = pl.read_csv(
    "/home/rogertu/projects/KGEM/data/MIND/graph.txt",
    separator="\t",
    has_header=False,
    new_columns=["head", "rel", "tail"],
)

In [10]:
# Peek at the raw triples; relation labels look like "<relation>_<abbreviation>".
mind_triples.head(2)

head,rel,tail
str,str,str
"""UNII:BTY153760…","""inhibits_CinG""","""NCBIGene:3605"""
"""CHEBI:10056""","""activates_CaG""","""NCBIGene:1129"""


### Add relevant information from nodes into the triples

In [15]:
# NOTE(review): `edges` is not defined in any cell of this notebook, so this cell
# fails on a fresh kernel (Restart & Run All). Either load the edge table in the
# "load the data" section or remove this cell -- confirm which was intended.
edges.head(2)

start_id,end_id,type,dsrc_type,comp_type,p_val,adj_p,source,license,experiments,support_type,pmids,phase,date,name,name_x,name_y,merge_id,reactome_id,abbrev,type_no_abbv
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""UNII:BTY153760…","""NCBIGene:3605""","""inhibits_CinG""","""computed""","""merge""",,,"""WikiData""","""CC0 1.0""",,,,,,,,,,,"""CinG""","""inhibits"""
"""CHEBI:10056""","""NCBIGene:1129""","""activates_CaG""","""computed""","""merge""",,,"""WikiData""","""CC0 1.0""",,,,,,,,,,,"""CaG""","""activates"""


In [16]:
# Attach the node label of each endpoint to every triple. The lookup columns are
# aliased up front (id -> head/tail, label -> h_label/t_label) so that each left
# join lands the label directly under its final name.
mind_triples = mind_triples.join(
    nodes.select(pl.col("id").alias("head"), pl.col("label").alias("h_label")),
    on="head",
    how="left",
).join(
    nodes.select(pl.col("id").alias("tail"), pl.col("label").alias("t_label")),
    on="tail",
    how="left",
)

In [17]:
# Confirm that h_label / t_label were added.
mind_triples.head(2)

head,rel,tail,h_label,t_label
str,str,str,str,str
"""UNII:BTY153760…","""inhibits_CinG""","""NCBIGene:3605""","""ChemicalSubsta…","""Macromolecular…"
"""CHEBI:10056""","""activates_CaG""","""NCBIGene:1129""","""ChemicalSubsta…","""Macromolecular…"


### reconstruct the node and edge abbreviations

In [102]:
# Split every non-indication relation label (e.g. "inhibits_CinG") into its parts.
# Plain "indication" rows are excluded here because their label has no
# abbreviation suffix to parse; they are handled separately further down.
mind_triples_noind = (
    mind_triples.filter(pl.col("rel") != "indication")
    .with_columns(
        # Relation name: first run of lowercase letters/underscores with the
        # trailing "_" separator removed, e.g. "inhibits_CinG" -> "inhibits".
        pl.col("rel")
        .str.extract(r"[a-z_]+", group_index=0)
        .str.strip_suffix("_")
        .alias("rel_no_abbv"),
        # Full abbreviation: uppercase run, lowercase/"<"/">" run, uppercase run,
        # e.g. "inhibits_CinG" -> "CinG".
        pl.col("rel").str.extract(r"[A-Z]+[a-z><]+[A-Z]+", group_index=0).alias("abbv"),
    )
    .with_columns(
        # Pieces of the abbreviation, e.g. "CinG" -> "in" (relation),
        # "C" (leading uppercase run = head type), "G" (trailing uppercase run = tail type).
        pl.col("abbv").str.extract(r"[a-z><]+", group_index=0).alias("rel_abbv"),
        pl.col("abbv").str.extract(r"[A-Z]+", group_index=0).alias("h_abbv"),
        pl.col("abbv").str.extract(r"[A-Z]+$", group_index=0).alias("t_abbv"),
    )
)

In [103]:
# Inspect the parsed relation name and abbreviation columns.
mind_triples_noind.head(2)

head,rel,tail,h_label,t_label,rel_no_abbv,abbv,rel_abbv,h_abbv,t_abbv
str,str,str,str,str,str,str,str,str,str
"""UNII:BTY153760…","""inhibits_CinG""","""NCBIGene:3605""","""ChemicalSubsta…","""Macromolecular…","""inhibits""","""CinG""","""in""","""C""","""G"""
"""CHEBI:10056""","""activates_CaG""","""NCBIGene:1129""","""ChemicalSubsta…","""Macromolecular…","""activates""","""CaG""","""a""","""C""","""G"""


In [104]:
# Sanity check: list rows whose relation abbreviation could not be extracted
# (an empty frame means every relation label parsed).
mind_triples_noind.filter(pl.col("rel_abbv").is_null())

head,rel,tail,h_label,t_label,rel_no_abbv,abbv,rel_abbv,h_abbv,t_abbv
str,str,str,str,str,str,str,str,str,str


### create an abbreviation dictionary

#### create node dictionary

In [105]:
# One row per head-node label, reduced to (label, abbreviation) pairs.
# NOTE(review): built from head labels only -- a label that occurs solely as a
# tail would be missing from the dictionary derived from this frame.
tmp_df = mind_triples_noind.unique("h_label").select(["h_label", "h_abbv"])
tmp_df

h_label,h_abbv
str,str
"""AnatomicalEnti…","""A"""
"""GeneFamily""","""F"""
"""Disease""","""D"""
"""BiologicalProc…","""BP"""
"""OrganismTaxon""","""T"""
"""Macromolecular…","""G"""
"""ChemicalSubsta…","""C"""
"""Pathway""","""PW"""


In [106]:
# Node label -> abbreviation lookup. `tmp_df` holds exactly the two columns
# (h_label, h_abbv), so each row is already a (key, value) pair.
tmp_dict = dict(tmp_df.rows())

In [107]:
# Display the node label -> abbreviation mapping.
tmp_dict

{'AnatomicalEntity': 'A',
 'GeneFamily': 'F',
 'Disease': 'D',
 'BiologicalProcessOrActivity': 'BP',
 'OrganismTaxon': 'T',
 'MacromolecularMachine': 'G',
 'ChemicalSubstance': 'C',
 'Pathway': 'PW'}

In [108]:
# One row per relation name, reduced to (name, abbreviation) pairs.
# This rebinds `tmp_df`, which previously held the node-label pairs.
# NOTE(review): a relation name can occur with more than one abbreviation (a later
# sample row shows "pr>" for positively_regulates, while the dictionary built from
# this frame has "pr"), so the abbreviation kept per name is not guaranteed.
tmp_df = mind_triples_noind.unique("rel_no_abbv").select(["rel_no_abbv", "rel_abbv"])
tmp_df

rel_no_abbv,rel_abbv
str,str
"""in_reaction_wi…","""rx"""
"""treats""","""t"""
"""part_of""","""po"""
"""site_of""","""so"""
"""marker_or_mech…","""m"""
…,…
"""causes""","""c"""
"""has_input""","""hi"""
"""in_taxon""","""it"""
"""activates""","""a"""


#### create relation dictionary

In [109]:
# Relation name -> abbreviation lookup. `tmp_df` holds exactly the two columns
# (rel_no_abbv, rel_abbv) at this point, so each row is already a (key, value) pair.
tmp_dict2 = dict(tmp_df.rows())

In [110]:
# Display the relation name -> abbreviation mapping.
tmp_dict2

{'in_reaction_with': 'rx',
 'treats': 't',
 'part_of': 'po',
 'site_of': 'so',
 'marker_or_mechanism': 'm',
 'associated_with': 'aw',
 'negatively_regulates': 'nr',
 'regulates': 'r>',
 'positively_regulates': 'pr',
 'prevents': 'pv',
 'presents': 'ps',
 'inhibits': 'in',
 'produces': 'p',
 'palliates': 'pl',
 'capable_of': 'co',
 'disrupts': 'd',
 'causes': 'c',
 'has_input': 'hi',
 'in_taxon': 'it',
 'activates': 'a',
 'affects': 'af'}

In [111]:
# List the abbreviations already in use before adding one for "indication".
tmp_dict2.values()

dict_values(['rx', 't', 'po', 'so', 'm', 'aw', 'nr', 'r>', 'pr', 'pv', 'ps', 'in', 'p', 'pl', 'co', 'd', 'c', 'hi', 'it', 'a', 'af'])

In [113]:
# "indication" has no abbreviation in the source labels; assign it "i".
tmp_dict2["indication"] = "i"

In [114]:
# Fold the relation abbreviations into the node dictionary, so `tmp_dict` now
# mixes node-label keys and relation-name keys in one mapping (mutates in place).
tmp_dict.update(tmp_dict2)

In [115]:
# Display the combined node + relation abbreviation mapping.
tmp_dict

{'AnatomicalEntity': 'A',
 'GeneFamily': 'F',
 'Disease': 'D',
 'BiologicalProcessOrActivity': 'BP',
 'OrganismTaxon': 'T',
 'MacromolecularMachine': 'G',
 'ChemicalSubstance': 'C',
 'Pathway': 'PW',
 'in_reaction_with': 'rx',
 'treats': 't',
 'part_of': 'po',
 'site_of': 'so',
 'marker_or_mechanism': 'm',
 'associated_with': 'aw',
 'negatively_regulates': 'nr',
 'regulates': 'r>',
 'positively_regulates': 'pr',
 'prevents': 'pv',
 'presents': 'ps',
 'inhibits': 'in',
 'produces': 'p',
 'palliates': 'pl',
 'capable_of': 'co',
 'disrupts': 'd',
 'causes': 'c',
 'has_input': 'hi',
 'in_taxon': 'it',
 'activates': 'a',
 'affects': 'af',
 'indication': 'i'}

### Update the indications dataframe

In [116]:
# Pull out the "indication" triples, whose relation label carries no
# abbreviation suffix yet.
mind_triples_ind = mind_triples.filter(pl.col("rel") == "indication")
mind_triples_ind.head()

head,rel,tail,h_label,t_label
str,str,str,str,str
"""CHEBI:32120""","""indication""","""REACT:R-HSA-21…","""ChemicalSubsta…","""Pathway"""
"""UBERON:0001132…","""indication""","""DOID:11199""","""AnatomicalEnti…","""Disease"""
"""CHEBI:10023""","""indication""","""DOID:0050289""","""ChemicalSubsta…","""Disease"""
"""CHEBI:100241""","""indication""","""DOID:12385""","""ChemicalSubsta…","""Disease"""
"""CHEBI:100241""","""indication""","""DOID:13258""","""ChemicalSubsta…","""Disease"""


In [117]:
# Reference: the column layout the indication rows need to match.
mind_triples_noind.head(2)

head,rel,tail,h_label,t_label,rel_no_abbv,abbv,rel_abbv,h_abbv,t_abbv
str,str,str,str,str,str,str,str,str,str
"""UNII:BTY153760…","""inhibits_CinG""","""NCBIGene:3605""","""ChemicalSubsta…","""Macromolecular…","""inhibits""","""CinG""","""in""","""C""","""G"""
"""CHEBI:10056""","""activates_CaG""","""NCBIGene:1129""","""ChemicalSubsta…","""Macromolecular…","""activates""","""CaG""","""a""","""C""","""G"""


In [122]:
# Give the indication triples the same columns as `mind_triples_noind`.
#
# The frame is rebuilt from `mind_triples` rather than from the previous value of
# `mind_triples_ind`, so this cell is idempotent: re-running it no longer fails on
# renaming `rel` to an already existing `rel_no_abbv` column.
mind_triples_ind = (
    mind_triples.filter(pl.col("rel") == "indication")
    # The bare label "indication" is the relation name without abbreviation.
    .rename({"rel": "rel_no_abbv"})
    .with_columns(
        pl.lit("i").alias("rel_abbv"),
        # Map node labels to their abbreviations; labels missing from `tmp_dict`
        # are left unchanged by `replace`.
        pl.col("h_label").replace(tmp_dict).alias("h_abbv"),
        pl.col("t_label").replace(tmp_dict).alias("t_abbv"),
    )
    .with_columns(
        # e.g. "C" + "i" + "D" -> "CiD"
        (pl.col("h_abbv") + pl.col("rel_abbv") + pl.col("t_abbv")).alias("abbv"),
    )
    .with_columns(
        # Rebuild the full relation label, e.g. "indication_CiD".
        (pl.col("rel_no_abbv") + "_" + pl.col("abbv")).alias("rel"),
    )
    # Same column order as `mind_triples_noind`, so the two frames can be concatenated.
    .select(
        [
            "head",
            "rel",
            "tail",
            "h_label",
            "t_label",
            "rel_no_abbv",
            "abbv",
            "rel_abbv",
            "h_abbv",
            "t_abbv",
        ]
    )
)

mind_triples_ind.head(2)

head,rel,tail,h_label,t_label,rel_no_abbv,abbv,rel_abbv,h_abbv,t_abbv
str,str,str,str,str,str,str,str,str,str
"""CHEBI:32120""","""indication_CiP…","""REACT:R-HSA-21…","""ChemicalSubsta…","""Pathway""","""indication""","""CiPW""","""i""","""C""","""PW"""
"""UBERON:0001132…","""indication_AiD…","""DOID:11199""","""AnatomicalEnti…","""Disease""","""indication""","""AiD""","""i""","""A""","""D"""


### Combine the dataframes back together

In [124]:
# Stack the parsed non-indication triples and the rebuilt indication triples
# into one frame (both share the same ten columns).
mind_triples2 = pl.concat([mind_triples_noind, mind_triples_ind])

## Filter the dataset like Rivas _et al._

### Filter nodes to only have compounds, diseases and genes

In [126]:
# Peek at the recombined triples.
mind_triples2.head(2)

head,rel,tail,h_label,t_label,rel_no_abbv,abbv,rel_abbv,h_abbv,t_abbv
str,str,str,str,str,str,str,str,str,str
"""UNII:BTY153760…","""inhibits_CinG""","""NCBIGene:3605""","""ChemicalSubsta…","""Macromolecular…","""inhibits""","""CinG""","""in""","""C""","""G"""
"""CHEBI:10056""","""activates_CaG""","""NCBIGene:1129""","""ChemicalSubsta…","""Macromolecular…","""activates""","""CaG""","""a""","""C""","""G"""


In [132]:
# Keep only triples whose head AND tail are a gene ("G"), compound ("C") or
# disease ("D") node.
filt_triples = mind_triples2.filter(
    pl.col("h_abbv").is_in(["G", "C", "D"])
    & pl.col("t_abbv").is_in(["G", "C", "D"])
)

### Get the unique relationship types

In [133]:
# Distinct relation names remaining after the node-type filter.
filt_triples.get_column("rel_no_abbv").unique().to_list()

['palliates',
 'inhibits',
 'affects',
 'activates',
 'prevents',
 'positively_regulates',
 'negatively_regulates',
 'part_of',
 'treats',
 'indication',
 'regulates',
 'associated_with',
 'marker_or_mechanism',
 'in_reaction_with']

### Figure out the mappings using the biolink model
* map each of the edge types to 'treats', 'associated_with', 'activates', 'inhibits'
* *italicized* means that the relation shows up between multiple node-type pairs, e.g. Gene-positively_regulates-Compound, Gene-positively_regulates-Gene
* mappings
    * Compound-Gene:
        * activates
        * inhibits
        * affects (remove)
        * *part_of* (remove)
    * Gene-Compound:
        * *positively_regulates*
        * *negatively_regulates*
        * in_reaction_with (remove)
        * *regulates* (remove)
    * Compound-Disease: 
        * palliates (merge w/ treats)
        * prevents (merge w/ treats)
        * treats
        * indication 
    * Gene-Gene:
        * *positively_regulates*
        * *negatively_regulates*
        * *regulates* (remove)
        * *part_of* (remove)
    * Gene-Disease: 
        * associated_with
        * marker_or_mechanism




In [172]:
# Drop the relations marked "(remove)" above, fold palliates/prevents into
# treats, and de-duplicate on (head, new_rel, tail).
#
# `keep="first"` with `maintain_order=True` makes the de-duplication
# deterministic. With the defaults the surviving row and the output row order are
# unspecified, which changes the exported graph file between runs and therefore
# the seeded train/test/validation split built from it.
filt_triples = (
    filt_triples.filter(
        ~pl.col("rel_no_abbv").is_in(
            ["affects", "regulates", "part_of", "in_reaction_with"]
        )
    )
    .with_columns(
        # Relation names not listed in the mapping are carried over unchanged.
        pl.col("rel_no_abbv")
        .replace({"palliates": "treats", "prevents": "treats"})
        .alias("new_rel")
    )
    .unique(["head", "new_rel", "tail"], keep="first", maintain_order=True)
)

In [173]:
# (rows, columns) of the filtered, de-duplicated triples.
filt_triples.shape

(1101475, 11)

In [174]:
# One example row per remaining relation (`new_rel`).
filt_triples.unique("new_rel")

head,rel,tail,h_label,t_label,rel_no_abbv,abbv,rel_abbv,h_abbv,t_abbv,new_rel
str,str,str,str,str,str,str,str,str,str,str
"""NCBIGene:10125…","""associated_wit…","""DOID:0110750""","""Macromolecular…","""Disease""","""associated_wit…","""GawD""","""aw""","""G""","""D""","""associated_wit…"
"""CHEBI:100147""","""treats_CtD""","""WD:Q21109048""","""ChemicalSubsta…","""Disease""","""treats""","""CtD""","""t""","""C""","""D""","""treats"""
"""MESH:C001079""","""marker_or_mech…","""MESH:D000014""","""ChemicalSubsta…","""Disease""","""marker_or_mech…","""CmD""","""m""","""C""","""D""","""marker_or_mech…"
"""REACT:R-HSA-17…","""positively_reg…","""NCBIGene:4790""","""Macromolecular…","""Macromolecular…","""positively_reg…","""Gpr>G""","""pr>""","""G""","""G""","""positively_reg…"
"""UniProt:O15519…","""negatively_reg…","""NCBIGene:355""","""Macromolecular…","""Macromolecular…","""negatively_reg…","""Gnr>G""","""nr>""","""G""","""G""","""negatively_reg…"
"""CHEBI:135755""","""activates_CaG""","""NCBIGene:5737""","""ChemicalSubsta…","""Macromolecular…","""activates""","""CaG""","""a""","""C""","""G""","""activates"""
"""CHEBI:100241""","""indication_CiD…","""DOID:1679""","""ChemicalSubsta…","""Disease""","""indication""","""CiD""","""i""","""C""","""D""","""indication"""
"""CHEBI:3647""","""inhibits_CinG""","""NCBIGene:3363""","""ChemicalSubsta…","""Macromolecular…","""inhibits""","""CinG""","""in""","""C""","""G""","""inhibits"""


### Get number of unique nodes

In [179]:
# Distinct node IDs that appear as either head or tail of a retained triple.
filt_nodes = (
    pl.concat([filt_triples.get_column("head"), filt_triples.get_column("tail")])
    .unique()
    .to_list()
)

In [180]:
# Number of distinct nodes in the minimal graph.
len(filt_nodes)

64095

## Export graph file

In [188]:
# Write head / new_rel / tail as a headerless, tab-separated file; this is the
# file read back by `TriplesFactory.from_path` below.
# NOTE(review): absolute, user-specific path -- adjust before running on another machine.
filt_triples.select(["head", "new_rel", "tail"]).write_csv(
    file="/home/rogertu/projects/KGEM/data/mini_MIND/graph.tsv",
    include_header=False,
    separator="\t",
)

## Create PyKEEN dataset object

In [181]:
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline

In [189]:
# Build a PyKEEN triples factory from the exported TSV, with inverse triples enabled.
tf = TriplesFactory.from_path(
    path="/home/rogertu/projects/KGEM/data/mini_MIND/graph.tsv",
    create_inverse_triples=True,
    delimiter="\t",
)

In [190]:
# 80/10/10 split with a fixed random state; the three returned factories are
# bound to train, test and valid in that order.
train, test, valid = tf.split(
    ratios=[0.8, 0.1, 0.1],
    random_state=42,
)

In [191]:
# Summary of the training factory (entity, relation and triple counts).
train

TriplesFactory(num_entities=64095, num_relations=16, create_inverse_triples=True, num_triples=881180, path="/home/rogertu/projects/KGEM/data/mini_MIND/graph.tsv")

In [194]:
# Integer ID -> label for the relations. Only 8 labels are listed, while the repr
# above reports num_relations=16 -- presumably that count also includes the
# inverse relations (confirm against the PyKEEN docs).
train.relation_id_to_label

{0: 'activates',
 1: 'associated_with',
 2: 'indication',
 3: 'inhibits',
 4: 'marker_or_mechanism',
 5: 'negatively_regulates',
 6: 'positively_regulates',
 7: 'treats'}

### Create a test pipeline

In [None]:
# Smoke-test the dataset with a short TransE run (2 epochs, 50-dim embeddings).
#
# The relation IDs to evaluate on are resolved from their labels instead of being
# hard-coded as [2, 7]: the integer IDs depend on the set of relation labels in
# the graph file and would silently point at different relations if that set changes.
eval_relation_ids = [
    train.relation_to_id[label] for label in ("indication", "treats")
]

res = pipeline(
    training=train,
    testing=test,
    validation=valid,
    model="TransE",
    model_kwargs=dict(embedding_dim=50),
    training_kwargs=dict(num_epochs=2),
    evaluation_kwargs=dict(
        restrict_relations_to=eval_relation_ids, pre_filtered_triples=False
    ),  # only test on "treats" and "indication"
    random_seed=239979851,
)

In [201]:
# NOTE(review): this cell shows no output in the saved notebook; the next cell
# reads the metric through `res.get_metric` instead. Candidate for removal -- confirm.
res.metric_results.metrics.get("mean_rank")

In [210]:
# Mean rank on the evaluation triples, looked up with the "both" side prefix.
res.get_metric(key="both.mean_rank")

3570.445556640625

### Make predictions

In [212]:
from pykeen.predict import predict_target

In [220]:
# Show the held-out test triples with both integer IDs and labels.
test.tensor_to_df(tensor=test.mapped_triples)

Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label
0,7010,CHEBI:85991,3,inhibits,56968,NCBIGene:7468
1,6238,CHEBI:7790,0,activates,33003,NCBIGene:10897
2,6091,CHEBI:75873,3,inhibits,49767,NCBIGene:4082
3,62591,REACT:R-HSA-9012172,5,negatively_regulates,62784,RNAC:URS000023C301_9606
4,28747,MESH:D009930,4,marker_or_mechanism,13403,DOID:14667
...,...,...,...,...,...,...
110142,3763,CHEBI:38319,3,inhibits,54211,NCBIGene:596
110143,1356,CHEBI:16842,0,activates,49776,NCBIGene:4090
110144,53619,NCBIGene:5743,4,marker_or_mechanism,14182,DOID:332
110145,5370,CHEBI:641,3,inhibits,35874,NCBIGene:165215


In [221]:
# Score every entity as a candidate tail for (CHEBI:85991, indication, ?).
# Labels are passed instead of the raw integer IDs (7010 and 2) and are resolved
# through `triples_factory`, so the query stays correct if the entity or relation
# ID assignment changes when the graph file is regenerated.
predict_target(
    model=res.model,
    head="CHEBI:85991",
    relation="indication",
    triples_factory=train,
)

TargetPredictions(df=       tail_id      score     tail_label
13691    13691  -6.272489      DOID:2150
12261    12261  -6.474322     DOID:10825
14688    14688  -6.646122      DOID:4478
15592    15592  -6.695927      DOID:7148
14926    14926  -6.818472     DOID:50338
...        ...        ...            ...
3414      3414 -14.438444     CHEBI:3485
5078      5078 -14.459141    CHEBI:59662
24482    24482 -14.530169   MESH:C095424
47828    47828 -14.544150  NCBIGene:3456
4844      4844 -14.633337    CHEBI:53675

[64095 rows x 3 columns], factory=TriplesFactory(num_entities=64095, num_relations=16, create_inverse_triples=True, num_triples=881180, path="/home/rogertu/projects/KGEM/data/mini_MIND/graph.tsv"), target='tail', other_columns_fixed_ids=(7010, 2))