# Making Knowledge Graph Embedding predictions overview
* [Rivas et al](https://academic.oup.com/bib/article/23/6/bbac481/6831005) ran KGEM hyperparameter optimizations on a miniturized BioKG and OpenBiolink (OBL) dataset
* this notebook aims to highlight the general process of extracting and running the generated models in the paper
* there are three sections: training a model using the optimized parameters, loading a model and making predictions. 

In [1]:
import pykeen
import os
import gc

import json
import torch
import pandas as pd
import polars as pl
from pykeen.triples import TriplesFactory
from pykeen.pipeline import pipeline

# Training a model using the optimized parameters

* If you want to train your own model using pykeen, and you have already run hyperparameter optimizations, see the following code cells

## Read in optimized parameters for TransE

In [33]:
# location of the downloaded hyperparameters from rivas et al paper
with open(
    "/home/rogertu/projects/KGEM/models/biokg/transe/0000_user_data_transe/best_pipeline/pipeline_config.json",
    "r",
) as f:
    configs = json.load(f)

In [34]:
# sample of what their configs look like. JSON of metadata and pipeline values
configs

{'metadata': {'_stopper_comment': 'While the original config had 300, early stopping will now switch it to 110',
  '_stopper_kwargs_removed_comment': "stopper_kwargs config removed after HPO: {'frequency': 10, 'patience': 3, 'relative_delta': 0.002}",
  'best_trial_evaluation': 0.14579331540530158,
  'best_trial_number': 17,
  'git_hash': 'UNHASHED',
  'version': '1.8.0'},
 'pipeline': {'dataset_kwargs': {'create_inverse_triples': False},
  'evaluation_kwargs': {'batch_size': None},
  'evaluator': 'rankbased',
  'evaluator_kwargs': {'filtered': True},
  'filter_validation_when_testing': True,
  'loss': 'bcewithlogits',
  'model': 'transe',
  'model_kwargs': {'embedding_dim': 48, 'scoring_fct_norm': 1},
  'optimizer': 'adam',
  'optimizer_kwargs': {'lr': 0.004722695778102846},
  'testing': '/opt/ml/processing/input/test.tsv',
  'training': '/opt/ml/processing/input/train.tsv',
  'training_kwargs': {'batch_size': 1232, 'num_epochs': 110},
  'training_loop': 'lcwa',
  'validation': '/opt/

In [35]:
# we just want the pipeline
pipeline_configs = configs["pipeline"]

In [42]:
# add checkpoint name so we can load the model later
pipeline_configs["training_kwargs"].update(
    {"checkpoint_name": "biokg_transe_checkpoint.pt", "checkpoint_frequency": 1}
)
# update train/test/valid paths

biokg_base = "/home/rogertu/projects/KGEM/data/biokg"
pipeline_configs.update(
    {
        "training": os.path.join(biokg_base, "train.tsv"),
        "testing": os.path.join(biokg_base, "test.tsv"),
        "validation": os.path.join(biokg_base, "valid.tsv"),
    }
)

In [43]:
# check new configs
pipeline_configs

{'dataset_kwargs': {'create_inverse_triples': False},
 'evaluation_kwargs': {'batch_size': None},
 'evaluator': 'rankbased',
 'evaluator_kwargs': {'filtered': True},
 'filter_validation_when_testing': True,
 'loss': 'bcewithlogits',
 'model': 'transe',
 'model_kwargs': {'embedding_dim': 48, 'scoring_fct_norm': 1},
 'optimizer': 'adam',
 'optimizer_kwargs': {'lr': 0.004722695778102846},
 'testing': '/home/rogertu/projects/KGEM/data/biokg/test.tsv',
 'training': '/home/rogertu/projects/KGEM/data/biokg/train.tsv',
 'training_kwargs': {'batch_size': 1232,
  'num_epochs': 100,
  'checkpoint_name': 'biokg_transe_checkpoint.pt',
  'checkpoint_frequency': 5},
 'training_loop': 'lcwa',
 'validation': '/home/rogertu/projects/KGEM/data/biokg/valid.tsv'}

## Train the model using the predefined configurations

In [None]:
# runs the pykeen training pipeline to build a model for the mini-biokg dataset (as seen in the rivas paper)
res = pipeline(**pipeline_configs)

# Load prior trained model
* the following is a template of how to run predictions for already trained models
* not guaranteed to run, because you need to have the models already created. The code was copied from a different notebook that ran
* two steps: load the dataset/model, and make predictions

## load the dataset and model

In [47]:
# load the datasets
train = TriplesFactory.from_path(os.path.join(biokg_base, "train.tsv"), delimiter="\t")
test = TriplesFactory.from_path(os.path.join(biokg_base, "test.tsv"), delimiter="\t")
valid = TriplesFactory.from_path(os.path.join(biokg_base, "valid.tsv"), delimiter="\t")

In [None]:
# initialize model parameters
pykeen_model = pykeen.models.TransE(  # pick the model that you had trained
    triples_factory=train,
    # model configuration
    embedding_dim=pipeline_configs["model_kwargs"]["embedding_dim"],
    scoring_fct_norm=pipeline_configs["model_kwargs"]["scoring_fct_norm"],
)

# load the checkpoint. This directory for me was at ~./data/pykeen/checkpoints
model_checkpoint = torch.load(
    pykeen.constants.PYKEEN_CHECKPOINTS.joinpath("biokg_transe_checkpoint.pt")
)
# attach state to model
pykeen_model.load_state_dict(model_checkpoint["state_dict"])

## Make predictions using self-trained model

In [49]:
# inspect test
test.tensor_to_df(test.mapped_triples).head()

Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label
0,0,drugbank:DB00002,0,treats,1319,mesh:D000077195
1,1,drugbank:DB00006,0,treats,2243,mesh:D020521
2,1,drugbank:DB00006,0,treats,2286,mesh:D048909
3,2,drugbank:DB00007,0,treats,1930,mesh:D011629
4,3,drugbank:DB00008,0,treats,1344,mesh:D000740


In [None]:
# make a prediction dataframe for a given entry. Will make a ranked df of all entities possible in your dataset
# ordered from most likely to least likely (as thought by the model)
pred_df = pykeen.predict.predict_target(
    model=pykeen_model, triples_factory=test, head=test[0][0], relation=test[0][1]
)

# Load pre-trained model
* models come from rivas' paper
* extract the files in the artifacts by using gunzip
* two main steps: load the pre-trained model, then make predictions

In [100]:
# we're going to need to first construct a Mapping[str, int] (aka a dictionary) for entities and relations passed to the model
# these are found in the model directory and kindly provided by the authors (nice of them)

# path to best model
biokg_model = os.path.join(
    "/home/rogertu/projects/KGEM/models/biokg/transe/0000_user_data_transe/artifacts",
    str(configs["metadata"]["best_trial_number"]),
)

# dataframes for entity and relation mappings
e2id = pd.read_csv(
    os.path.join(biokg_model, "training_triples", "entity_to_id.tsv"), sep="\t"
)
r2id = pd.read_csv(
    os.path.join(biokg_model, "training_triples", "relation_to_id.tsv"), sep="\t"
)

# dictionary of e2id and r2id

e2id_dict = dict(zip(e2id["label"], e2id["id"]))
r2id_dict = dict(zip(r2id["label"], r2id["id"]))
id2e_dict = dict(zip(e2id["id"], e2id["label"]))
id2r_dict = dict(zip(r2id["id"], r2id["label"]))

In [98]:
# create a dataframe with the translated training triples, just so we can see what the model is predicting
# pl is polars, a faster version of pandas
(
    pl.read_csv(  # read dataframe
        os.path.join(biokg_model, "training_triples", "numeric_triples.tsv"),
        separator="\t",
        entity_to_id=e2id_dict,
    )
    .with_columns(  # convert numeric to string
        pl.col("head").replace(id2e_dict),
        pl.col("relation").replace(id2r_dict),
        pl.col("tail").replace(id2e_dict),
    )
    .write_csv(  # export to translated_triples.tsv
        os.path.join(biokg_model, "training_triples", "translated_triples.tsv"),
        separator="\t",
    )
)

In [104]:
# a model was already saved with the repository.
# we can load the model and use it to predict on the test set
# we however do need to know what the id2e and id2r mappings are otherwise predictions are non-sensical

model_checkpoint = torch.load(
    os.path.join(
        biokg_model,
        "trained_model.pkl",
    )
)

  model_checkpoint = torch.load(


## Make predictions
* score the likelihood of a given triple
* predict target given head/rel or rel/tail

### Predict triples

In [114]:
from pykeen.predict import predict_triples

In [121]:
predict_triples(
    model=paper_pk_model,
    triples=TriplesFactory.from_path(
        os.path.join(biokg_model, "training_triples", "translated_triples.tsv"),
        delimiter="\t",
        entity_to_id=e2id_dict,
        relation_to_id=r2id_dict,
    ),
)



ScorePack(result=tensor([[    0,     0, 10859],
        [    0,     1,  2416],
        [    0,     1,  2581],
        ...,
        [44412,     0,  3352],
        [44416,     0,  5246],
        [44417,     0,  3889]]), scores=tensor([-11.3291, -11.0540,  -9.8589,  ..., -10.0577, -10.9170,  -8.7985]))

### predict target

In [122]:
from pykeen.predict import predict_target

In [123]:
test.tensor_to_df(test.mapped_triples).head()

Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label
0,0,drugbank:DB00002,0,treats,1319,mesh:D000077195
1,1,drugbank:DB00006,0,treats,2243,mesh:D020521
2,1,drugbank:DB00006,0,treats,2286,mesh:D048909
3,2,drugbank:DB00007,0,treats,1930,mesh:D011629
4,3,drugbank:DB00008,0,treats,1344,mesh:D000740


In [126]:
# make one prediction from the test set
a_tail_predict = predict_target(
    model=paper_pk_model,
    head="drugbank:DB00002",
    relation="treats",
    triples_factory=TriplesFactory.from_path(
        os.path.join(biokg_model, "training_triples", "translated_triples.tsv"),
        delimiter="\t",
        entity_to_id=e2id_dict,
        relation_to_id=r2id_dict,
    ),
)



In [None]:
# Use pandas to identify the true target for the head/rel pair. The rank here is 22,233 given the TransE model loaded.
a_tail_predict.df.reset_index()[["tail_id", "score", "tail_label"]].query(
    'tail_label=="mesh:D000077195"'
)

Unnamed: 0,tail_id,score,tail_label
22233,2338,-9.570178,mesh:D000077195
