# Generate predictions for FB15k
Generate predictions for FB15k and store the as a collated dataframe for a set of given predictions.

In [None]:
import pykeen
import pykeen.datasets
import pykeen.models
import pykeen.predict
import pandas as pd
import numpy as np
import pickle
import os
import polars as pl
import torch

## Load in each model

### Load dataset

In [2]:
dataset = pykeen.datasets.get_dataset(dataset="FB15k")

  data = dict(torch.load(path.joinpath(cls.base_file_name)))
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
  data = dict(torch.load(path.joinpath(cls.base_file_name)))
  metadata = torch.load(metadata_path) if metadata_path.is_file() else None


### Load TransE model

In [3]:
# load transe model
transe_model = pykeen.models.TransE(
    triples_factory=dataset.training,
    embedding_dim=1000,
    scoring_fct_norm=2,
    random_seed=2747481262,  # use same seed as training otherwise model and chkpt train/test split will be different
)

# load chkpt
transe_chkpt = torch.load(
    pykeen.constants.PYKEEN_CHECKPOINTS.joinpath(
        "TransE_FB15k_2.pt"
    ),  # accidentally deleted original. rm is sometimes dangerous
)
# attach state to model
transe_model.load_state_dict(transe_chkpt["model_state_dict"])

  transe_chkpt = torch.load(


<All keys matched successfully>

### Load RotatE model

In [4]:
# load rotate model
rotate_model = pykeen.models.RotatE(
    triples_factory=dataset.training,
    embedding_dim=500,  # Note this is half the size of the actual embedding dim listed because rotate doubles the embedding dim
    random_seed=4055375379,  # use same seed as training otherwise model and chkpt train/test split will be different
)

# load chkpt
rotate_chkpt = torch.load(
    pykeen.constants.PYKEEN_CHECKPOINTS.joinpath("RotatE_FB15k.pt"),
)
# attach state to model
rotate_model.load_state_dict(transe_chkpt["model_state_dict"])

  rotate_chkpt = torch.load(


<All keys matched successfully>

### Load ComplEx model

In [5]:
# load rotate model
complex_model = pykeen.models.ComplEx(
    triples_factory=dataset.training,
    embedding_dim=1000,
    random_seed=1518493774,  # use same seed as training otherwise model and chkpt train/test split will be different
    regularizer_kwargs=dict(weight=0.000002, p=3),
)

# load chkpt
complex_chkpt = torch.load(
    pykeen.constants.PYKEEN_CHECKPOINTS.joinpath("ComplEx_FB15k.pt"),
)
# attach state to model
complex_model.load_state_dict(complex_chkpt["model_state_dict"])

  complex_chkpt = torch.load(


<All keys matched successfully>

### Load DistMult model

In [6]:
# load rotate model
distmult_model = pykeen.models.DistMult(
    triples_factory=dataset.training,
    embedding_dim=2000,
    random_seed=1373867215,  # use same seed as training otherwise model and chkpt train/test split will be different
    regularizer_kwargs=dict(weight=0.000002, p=3),
)

# load chkpt
distmult_chkpt = torch.load(
    pykeen.constants.PYKEEN_CHECKPOINTS.joinpath("DistMult_FB15k_1.pt"),
)
# attach state to model
distmult_model.load_state_dict(distmult_chkpt["model_state_dict"])

  distmult_chkpt = torch.load(


<All keys matched successfully>

## Evaluate model on a fixed test set

### how many nodes and relations are there that we can sample against?

In [151]:
f"Number of Nodes: {dataset.num_entities:,}"

'Number of Nodes: 14,951'

In [152]:
f"Number of Relations: {dataset.num_relations:,}"

'Number of Relations: 1,345'

In [None]:
# dataframe of testing triples and their correct answers
dataset.testing.tensor_to_df(dataset.testing.mapped_triples)

Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label
0,0,/m/010016,797,/location/hud_foreclosure_area/estimated_numbe...,13352,/m/0jbk9
1,1,/m/0100mt,707,/government/governmental_jurisdiction/governin...,14425,/m/0pqc5
2,1,/m/0100mt,799,/location/hud_foreclosure_area/hhuniv./measure...,13352,/m/0jbk9
3,1,/m/0100mt,800,/location/hud_foreclosure_area/total_90_day_va...,13352,/m/0jbk9
4,1,/m/0100mt,810,/location/location/people_born_here,2129,/m/01mmslz
...,...,...,...,...,...,...
59066,14944,/m/0zpfy,416,/common/topic/webpage./common/webpage/category,10505,/m/08mbj5d
59067,14945,/m/0zq7r,416,/common/topic/webpage./common/webpage/category,10505,/m/08mbj5d
59068,14947,/m/0zqq8,797,/location/hud_foreclosure_area/estimated_numbe...,13352,/m/0jbk9
59069,14947,/m/0zqq8,804,/location/location/containedby,14112,/m/0mww2


### Get test triples (at least a small set of it)

In [None]:
# how the tensor looks
dataset.testing.mapped_triples

tensor([[    0,   797, 13352],
        [    1,   707, 14425],
        [    1,   799, 13352],
        ...,
        [14947,   797, 13352],
        [14947,   804, 14112],
        [14947,   813,  4500]])

In [None]:
# size of the tensor
dataset.testing.mapped_triples.shape

torch.Size([59071, 3])

In [43]:
# get 1000 random indices to slice the testing triples
random_ind = np.random.choice(
    np.array(range(0, dataset.testing.mapped_triples.shape[0])), 1000, replace=False
)

In [None]:
# sliced tensor of testing triples
dataset.testing.mapped_triples[random_ind].shape

torch.Size([1000, 3])

In [46]:
# list of lists of triples
# [[head, relation, tail], ...]
test_set = dataset.testing.mapped_triples[random_ind].tolist()

### get top 1000 predictions in our random sample.
* May or may not be exactly 1000 unique entity/relation combinations

#### first generate a prediction for a given triple

In [110]:
# get the top predictions for the first entry in the test set
adf = pykeen.predict.predict_target(
    model=transe_model,
    triples_factory=dataset,
    head=test_set[0][0],
    relation=test_set[0][1],
)

In [87]:
# translate the entity ids to their actual names
adf.df["tail_id"] = adf.df["tail_id"].apply(
    lambda x: {v: k for k, v in adf.factory.entity_to_id.items()}[x]
)

In [89]:
adf.df.head()

Unnamed: 0,tail_id,score
8648,/m/05ls3r,-23.525023
13214,/m/0hmt3,-23.545902
13454,/m/0jnl5,-23.549711
7751,/m/04l5d0,-23.551441
7384,/m/048ldh,-23.552189


In [95]:
adf.df.shape

(14951, 4)

#### can we implement it in polars

In [111]:
polars_df = pl.DataFrame(adf.df)

In [112]:
polars_df.head()

tail_id,score
i64,f64
8648,-23.525023
13214,-23.545902
13454,-23.549711
7751,-23.551441
7384,-23.552189


In [None]:
# create dictionary of entities and relations
id2entity = {str(v): k for k, v in adf.factory.entity_to_id.items()}
id2relation = {str(v): k for k, v in adf.factory.relation_to_id.items()}

In [146]:
# polars function to replace the ids with their actual names and aggregate results as a list
# 25 s to do the groupby and replace

res_ls = []
for i in test_set:
    polars_df = polars_df.with_columns(
        pl.col("score").sort(descending=True), head_id=i[0], rel_id=i[1]
    ).with_columns(
        pl.col("tail_id").cast(pl.String).replace(id2entity),
        pl.col("head_id").cast(pl.String).replace(id2entity),
        pl.col("rel_id").cast(pl.String).replace(id2relation),
    )

    res_ls.append(polars_df)

In [None]:
# sample of the results. its 2-3x faster than pandas
pl.concat(res_ls).unique(["head_id", "rel_id", "tail_id"]).group_by(
    ["head_id", "rel_id"]
).agg("tail_id", maintain_order=True)

head_id,rel_id,tail_id,maintain_order
str,str,list[str],bool
"""/m/04pg29""","""/tv/tv_program…","[""/m/0356gk"", ""/m/096cw_"", … ""/m/02vl_pz""]",true
"""/m/07t3gd""","""/business/job_…","[""/m/01j95f"", ""/m/03l7tr"", … ""/m/03vtbc""]",true
"""/m/02rdyk7""","""/award/award_c…","[""/m/01kkg5"", ""/m/0122wc"", … ""/m/0bgv4g""]",true
"""/m/05zksls""","""/award/award_c…","[""/m/01k9cc"", ""/m/03lpp_"", … ""/m/03h42s4""]",true
"""/m/05k7sb""","""/location/loca…","[""/m/03j0ss"", ""/m/03__77"", … ""/m/0fp_xp""]",true
…,…,…,…
"""/m/01t265""","""/people/person…","[""/m/09hldj"", ""/m/06jd89"", … ""/m/080dyk""]",true
"""/m/0178_w""","""/music/musical…","[""/m/044l47"", ""/m/01slcv"", … ""/m/02cg2v""]",true
"""/m/09p4w8""","""/film/film/sta…","[""/m/0303jw"", ""/m/02rh_0"", … ""/m/02vx4""]",true
"""/m/054lpb6""","""/film/film_dis…","[""/m/03v9yw"", ""/m/01bdxz"", … ""/m/05b3ts""]",true


#### generate predictions on a bigger scale

In [None]:
# write afunction to make predictions on the models
def get_top_tail_predictions(model, test_set, dataset, k=None) -> pd.DataFrame:
    """
    Given a model, and a test set, return the top predictions for the test set

    model: pykeen.models.Model
    test_set: list of lists of triples [[head, relation, tail], ...]
    dataset: pykeen.datasets.dataset.TriplesFactory
    k: int, number of top predictions to return

    returns: pd.DataFrame
    """
    # create dictionaries for entities and relations
    id2entity = {v: k for k, v in dataset.entity_to_id.items()}
    id2relation = {v: k for k, v in dataset.relation_to_id.items()}

    # get the top predictions for the first entry in the test set
    res_ls = []
    for i in test_set:
        # generate predictions and cast to a polars dataframe
        polars_df = pl.DataFrame(
            pykeen.predict.predict_target(
                model=model, triples_factory=dataset, head=i[0], relation=i[1]
            ).df
        )
        # sort the predictions by score, add head and relation ids
        polars_df = polars_df.with_columns(
            pl.col("score").sort(descending=True),
            head_id=i[0],  # assign head_id
            rel_id=i[1],  # assign relation_id
        )
        res_ls.append(polars_df)

    # rename entities in head/tail/relation from ids to actual names
    # collapse tail_ids to a single row based on head and relation_ids
    res_df = (
        pl.concat(res_ls)
        .with_columns(
            # rename entities in head/tail/relation from ids to actual names
            pl.col("tail_id").cast(pl.String).replace(id2entity),
            pl.col("head_id").cast(pl.String).replace(id2entity),
            pl.col("rel_id").cast(pl.String).replace(id2relation),
        )
        .unique(["head_id", "rel_id", "tail_id"])
        .group_by(["head_id", "rel_id"])
        .agg("tail_id", maintain_order=True)
    )

    # return top k predictions
    if k > 0:
        res_df = res_df.with_columns(pl.col("tail_id").list.head(k))

    return res_df

### make predictions and export

In [169]:
transe_df = get_top_tail_predictions(transe_model, test_set, dataset, k=1000)

In [162]:
distumult_df = get_top_tail_predictions(distmult_model, test_set, dataset, k=1000)

In [163]:
complex_df = get_top_tail_predictions(complex_model, test_set, dataset, k=1000)

In [164]:
rotate_df = get_top_tail_predictions(rotate_model, test_set, dataset, k=1000)

#### add column name identifier to each dataframe.
* then stack them!

In [173]:
transe_df = transe_df.with_columns(model=pl.lit("TransE"))
distumult_df = distumult_df.with_columns(model=pl.lit("DistMult"))
complex_df = complex_df.with_columns(model=pl.lit("ComplEx"))
rotate_df = rotate_df.with_columns(model=pl.lit("RotatE"))

# combine the results
combined_df = pl.concat([transe_df, distumult_df, complex_df, rotate_df])

In [174]:
combined_df.head()

head_id,rel_id,tail_id,maintain_order,model
str,str,list[str],bool,str
"""/m/089g0h""","""/film/film_job…","[""/m/03h4fq7"", ""/m/09hy79"", … ""/m/0l2lk""]",True,"""TransE"""
"""/m/01whg97""","""/people/person…","[""/m/0f2v0"", ""/m/0dc95"", … ""/m/0fgsq2""]",True,"""TransE"""
"""/m/0jm3b""","""/sports/profes…","[""/m/0mbwf"", ""/m/01hr11"", … ""/m/0mhl6""]",True,"""TransE"""
"""/m/03h304l""","""/film/producer…","[""/m/02sfnv"", ""/m/057__d"", … ""/m/01gc7h""]",True,"""TransE"""
"""/m/037jz""","""/influence/inf…","[""/m/073bb"", ""/m/059y0"", … ""/m/0br1x_""]",True,"""TransE"""


In [175]:
combined_df.shape

(3796, 5)

In [None]:
assert (
    combined_df.shape[0] / 4 == combined_df.unique(["head_id", "rel_id"]).shape[0]
), "Some predictions are not made between all algorithms"

#### export the sample list as well as the parquet

In [180]:
with open(
    "/home/rogertu/projects/semmed/semmed/data/benchmark_data/FB15k_1000_sampled_test.pkl",
    "wb",
) as f:
    pickle.dump(combined_df, f)

In [182]:
combined_df.write_parquet(
    "/home/rogertu/projects/semmed/semmed/data/benchmark_data/FB15k_1000_sampled_test_predictions.parquet"
)