# Generate predictions for WN18RR
Generate predictions for WN18RR and store the as a collated dataframe for a set of given predictions.

In [1]:
import pykeen
import pykeen.datasets
import pykeen.models
import pykeen.predict
import pandas as pd
import numpy as np
import pickle
import os
import polars as pl
import torch

## Load in each model

### Load dataset

In [2]:
dataset = pykeen.datasets.get_dataset(dataset="WN18RR")

  data = dict(torch.load(path.joinpath(cls.base_file_name)))
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
  metadata = torch.load(metadata_path) if metadata_path.is_file() else None


### Load TransE model

In [3]:
# load transe model
transe_model = pykeen.models.TransE(
    triples_factory=dataset.training,
    embedding_dim=500,
    scoring_fct_norm=2,
    random_seed=246343514,  # use same seed as training otherwise model and chkpt train/test split will be different
)

# load chkpt
transe_chkpt = torch.load(
    pykeen.constants.PYKEEN_CHECKPOINTS.joinpath(
        "TransE_WN18RR.pt"
    ),  # accidentally deleted original. rm is sometimes dangerous
)
# attach state to model
transe_model.load_state_dict(transe_chkpt["model_state_dict"])

  transe_chkpt = torch.load(


<All keys matched successfully>

### Load RotatE model

In [4]:
# load rotate model
rotate_model = pykeen.models.RotatE(
    triples_factory=dataset.training,
    embedding_dim=250,  # Note this is half the size of the actual embedding dim listed because rotate doubles the embedding dim
    random_seed=711022683,  # use same seed as training otherwise model and chkpt train/test split will be different
)

# load chkpt
rotate_chkpt = torch.load(
    pykeen.constants.PYKEEN_CHECKPOINTS.joinpath("RotatE_WN18RR.pt"),
)
# attach state to model
rotate_model.load_state_dict(transe_chkpt["model_state_dict"])

  rotate_chkpt = torch.load(


<All keys matched successfully>

### Load ComplEx model

In [5]:
# load ComplEx model
complex_model = pykeen.models.ComplEx(
    triples_factory=dataset.training,
    embedding_dim=500,
    random_seed=374523484,  # use same seed as training otherwise model and chkpt train/test split will be different
    regularizer_kwargs=dict(weight=0.000005, p=3),
)

# load chkpt
complex_chkpt = torch.load(
    pykeen.constants.PYKEEN_CHECKPOINTS.joinpath("ComplEx_WN18RR.pt"),
)
# attach state to model
complex_model.load_state_dict(complex_chkpt["model_state_dict"])

  complex_chkpt = torch.load(


<All keys matched successfully>

### Load DistMult model

In [6]:
# load distmult model
distmult_model = pykeen.models.DistMult(
    triples_factory=dataset.training,
    embedding_dim=1000,
    random_seed=2171371192,  # use same seed as training otherwise model and chkpt train/test split will be different
    regularizer_kwargs=dict(weight=0.000005, p=3),
)

# load chkpt
distmult_chkpt = torch.load(
    pykeen.constants.PYKEEN_CHECKPOINTS.joinpath("DistMult_WN18RR.pt"),
)
# attach state to model
distmult_model.load_state_dict(distmult_chkpt["model_state_dict"])

  distmult_chkpt = torch.load(


<All keys matched successfully>

## Evaluate model on a fixed test set

### how many nodes and relations are there that we can sample against?

In [7]:
f"Number of Nodes: {dataset.num_entities:,}"

'Number of Nodes: 40,559'

In [8]:
f"Number of Relations: {dataset.num_relations:,}"

'Number of Relations: 11'

In [9]:
# dataframe of testing triples and their correct answers
dataset.testing.tensor_to_df(dataset.testing.mapped_triples)

Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label
0,13,3826,3,_hypernym,0,1740
1,17,4475,3,_hypernym,16,4258
2,24,6238,3,_hypernym,612,104868
3,29,6802,3,_hypernym,30,7012
4,31,7328,1,_derivationally_related_form,32248,10803193
...,...,...,...,...,...,...
2919,40477,15256714,3,_hypernym,22050,5867413
2920,40516,15274695,1,_derivationally_related_form,4244,779360
2921,40518,15275598,3,_hypernym,40509,15272029
2922,40521,15278281,3,_hypernym,40533,15286249


### Get test triples (at least a small set of it)

In [10]:
# how the tensor looks
dataset.testing.mapped_triples

tensor([[   13,     3,     0],
        [   17,     3,    16],
        [   24,     3,   612],
        ...,
        [40518,     3, 40509],
        [40521,     3, 40533],
        [40552,     9, 22456]])

In [11]:
# size of the tensor
dataset.testing.mapped_triples.shape

torch.Size([2924, 3])

In [12]:
# get 1000 random indices to slice the testing triples
random_ind = np.random.choice(
    np.array(range(0, dataset.testing.mapped_triples.shape[0])), 1000, replace=False
)

In [13]:
# sliced tensor of testing triples
dataset.testing.mapped_triples[random_ind].shape

torch.Size([1000, 3])

In [14]:
# list of lists of triples
# [[head, relation, tail], ...]
test_set = dataset.testing.mapped_triples[random_ind].tolist()

### get top 1000 predictions in our random sample.
* May or may not be exactly 1000 unique entity/relation combinations

In [15]:
# write afunction to make predictions on the models
def get_top_tail_predictions(model, test_set, dataset, k=None) -> pd.DataFrame:
    """
    Given a model, and a test set, return the top predictions for the test set

    model: pykeen.models.Model
    test_set: list of lists of triples [[head, relation, tail], ...]
    dataset: pykeen.datasets.dataset.TriplesFactory
    k: int, number of top predictions to return

    returns: pd.DataFrame
    """
    # create dictionaries for entities and relations
    id2entity = {v: k for k, v in dataset.entity_to_id.items()}
    id2relation = {v: k for k, v in dataset.relation_to_id.items()}

    # get the top predictions for the first entry in the test set
    res_ls = []
    for i in test_set:
        # generate predictions and cast to a polars dataframe
        polars_df = pl.DataFrame(
            pykeen.predict.predict_target(
                model=model, triples_factory=dataset, head=i[0], relation=i[1]
            ).df
        )
        # sort the predictions by score, add head and relation ids
        polars_df = polars_df.with_columns(
            pl.col("score").sort(descending=True),
            head_id=i[0],  # assign head_id
            rel_id=i[1],  # assign relation_id
        )
        res_ls.append(polars_df)

    # rename entities in head/tail/relation from ids to actual names
    # collapse tail_ids to a single row based on head and relation_ids
    res_df = (
        pl.concat(res_ls)
        .with_columns(
            # rename entities in head/tail/relation from ids to actual names
            pl.col("tail_id").cast(pl.String).replace(id2entity),
            pl.col("head_id").cast(pl.String).replace(id2entity),
            pl.col("rel_id").cast(pl.String).replace(id2relation),
        )
        .unique(["head_id", "rel_id", "tail_id"])
        .group_by(["head_id", "rel_id"])
        .agg("tail_id", maintain_order=True)
    )

    # return top k predictions
    if k > 0:
        res_df = res_df.with_columns(pl.col("tail_id").list.head(k))

    return res_df

### make predictions and export

In [16]:
transe_df = get_top_tail_predictions(transe_model, test_set, dataset, k=1000)

In [17]:
distumult_df = get_top_tail_predictions(distmult_model, test_set, dataset, k=1000)

In [18]:
complex_df = get_top_tail_predictions(complex_model, test_set, dataset, k=1000)

In [19]:
rotate_df = get_top_tail_predictions(rotate_model, test_set, dataset, k=1000)

#### add column name identifier to each dataframe.
* then stack them!

In [20]:
transe_df = transe_df.with_columns(model=pl.lit("TransE"))
distumult_df = distumult_df.with_columns(model=pl.lit("DistMult"))
complex_df = complex_df.with_columns(model=pl.lit("ComplEx"))
rotate_df = rotate_df.with_columns(model=pl.lit("RotatE"))

# combine the results
combined_df = pl.concat([transe_df, distumult_df, complex_df, rotate_df])

In [21]:
combined_df.head()

head_id,rel_id,tail_id,maintain_order,model
str,str,list[str],bool,str
"""1575401""","""_hypernym""","[""24264"", ""426928"", … ""6805297""]",True,"""TransE"""
"""12723610""","""_hypernym""","[""14034177"", ""7480068"", … ""14440875""]",True,"""TransE"""
"""508952""","""_hypernym""","[""2210855"", ""4623612"", … ""3196990""]",True,"""TransE"""
"""9767197""","""_derivationall…","[""5020358"", ""1225461"", … ""7255027""]",True,"""TransE"""
"""10826352""","""_instance_hype…","[""8177958"", ""730984"", … ""1749320""]",True,"""TransE"""


In [22]:
combined_df.shape

(3924, 5)

In [23]:
assert (
    combined_df.shape[0] / 4 == combined_df.unique(["head_id", "rel_id"]).shape[0]
), "Some predictions are not made between all algorithms"

#### export the sample list as well as the parquet

In [24]:
with open(
    "/home/rogertu/projects/semmed/semmed/data/benchmark_data/WN18RR_1000_sampled_test.pkl",
    "wb",
) as f:
    pickle.dump(combined_df, f)

In [25]:
combined_df.write_parquet(
    "/home/rogertu/projects/semmed/semmed/data/benchmark_data/WN18RR_1000_sampled_test_predictions.parquet"
)