## Load data

In [2]:
from pykeen.datasets import WD50KT

# Instantiate the WD50KT dataset class imported from pykeen.datasets; this
# object is what the training cell passes to the pipeline as `dataset`.
dataset = WD50KT()

## Load labels

In [None]:
import pickle

# Deserialize the entity labels from the pickle file in the working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load a
# labels.pkl that comes from a trusted source.
with open('labels.pkl', 'rb') as labels_file:
    labels = pickle.load(labels_file)

## Get embeddings for entity labels

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
# Inference only: make sure dropout is disabled so embeddings are deterministic.
model.eval()

# Number of labels encoded per forward pass.
batch_size = 50


def _masked_mean(hidden_states, attention_mask):
    """Average token embeddings per sequence, ignoring padded positions.

    Args:
        hidden_states: Token embeddings, expected as (batch, seq_len, hidden).
        attention_mask: 1 for real tokens and 0 for padding, (batch, seq_len).

    Returns:
        One vector per sequence, shape (batch, hidden).
    """
    mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
    # clamp guards against division by zero for a (theoretical) all-padding row.
    token_counts = mask.sum(dim=1).clamp(min=1)
    return (hidden_states * mask).sum(dim=1) / token_counts


batch_embeddings = []
for start in range(0, len(labels), batch_size):
    batch_labels = labels[start:start + batch_size]

    # Tokenize labels and convert to input IDs; shorter labels are padded to
    # the longest label of the batch.
    inputs = tokenizer(batch_labels, return_tensors='pt', padding=True, truncation=True, max_length=128)

    # Generate embeddings without tracking gradients.
    with torch.no_grad():
        outputs = model(**inputs)

    # Pool the last hidden state into a single vector per label. A plain
    # mean over the sequence axis would also average the padding positions,
    # making a label's vector depend on the other labels in its batch, so the
    # attention mask is used to restrict the mean to real tokens.
    batch_embeddings.append(
        _masked_mean(outputs.last_hidden_state, inputs['attention_mask'])
    )

# Concatenate all batches; torch.cat rejects an empty list, so an empty
# `labels` yields an empty (0, hidden_size) tensor instead of raising.
if batch_embeddings:
    my_literal_information = torch.cat(batch_embeddings)
else:
    my_literal_information = torch.empty((0, model.config.hidden_size))


## Train model

In [None]:
from pykeen.pipeline import pipeline

# Configuration for the PyKEEN pipeline run: train 'ComplExLiteral' on the
# dataset loaded above for 250 epochs, with the evaluation relation whitelist
# set to the single relation "P138".
# NOTE(review): the BERT-based `my_literal_information` tensor computed in the
# embeddings cell is not passed to this call. 'ComplExLiteral' presumably
# expects a triples factory that carries literals — confirm that this dataset
# provides them, or wire the embeddings in explicitly.
pipeline_kwargs = {
    'dataset': dataset,
    'model': 'ComplExLiteral',
    'training_kwargs': {'num_epochs': 250},
    'evaluation_relation_whitelist': {"P138"},
}
result = pipeline(**pipeline_kwargs)

In [None]:
# Persist the pipeline result under the given directory (relative to the
# current working directory) so the run can be inspected later.
result.save_to_directory('doctests/test_unstratified_complex')

## Generate predictions

In [None]:
import torch
from pykeen.models import Model
from pykeen import predict

from pykeen.datasets import get_dataset
from pykeen.predict import predict_target

# Get the trained model
model = result.model

# Create a mapping for relations (relation label -> relation id)
relation_label_to_id = result.training.relation_to_id

# Find the relation id for "named after" (Wikidata property P138)
named_after_id = relation_label_to_id["P138"]

# When experimenting with the Nations dataset instead of Wikidata, use the
# "conferences" relation; keep this disabled for the "named after" setup.
# named_after_id = relation_label_to_id["conferences"]

# Get the relation label
named_after_label = result.training.relation_id_to_label[named_after_id]

# Output files, both opened in append mode.
# NOTE(review): the per-entity header line goes to predicted.txt while the
# predicted tails go to gosho.csv. The paths are kept as in the original;
# confirm whether a single output file was intended.
HEADER_PATH = "predicted.txt"
ROWS_PATH = "gosho.csv"

# Number of top-ranked tail predictions kept per head entity.
TOP_K = 5

# Cache of entity id -> English label, so an entity that shows up repeatedly
# (e.g. a frequently predicted tail) is only requested once.
_label_cache = {}


def _english_label(entity_id):
    """Return the English Wikidata label for ``entity_id``.

    The label is requested through ``fetch_wikidata`` with the
    ``wbgetentities`` action. If the response carries no English label for the
    entity, the entity id itself is returned instead of raising ``KeyError``,
    so one unlabeled entity does not abort the whole prediction run.

    NOTE(review): ``fetch_wikidata`` is not defined in this cell; it must be
    provided by an earlier cell and return an object with a ``.json()`` method.
    """
    if entity_id not in _label_cache:
        response = fetch_wikidata({
            'action': 'wbgetentities',
            'ids': entity_id,
            'format': 'json',
            'languages': 'en',
        })
        entity = response.json().get('entities', {}).get(entity_id, {})
        _label_cache[entity_id] = (
            entity.get('labels', {}).get('en', {}).get('value', entity_id)
        )
    return _label_cache[entity_id]


# Prepare a dictionary to store predictions (head label -> top-k data frame)
predictions = {}

# Loop over all entities
numOfAllEntities = model.num_entities

# Both files stay open for the whole run and are closed even if an iteration
# raises. UTF-8 is set explicitly because Wikidata labels are often non-ASCII.
with open(HEADER_PATH, "a", encoding="utf-8") as header_file, \
        open(ROWS_PATH, "a", encoding="utf-8") as rows_file:
    for i, head_label in enumerate(result.training.entity_id_to_label.values(), start=1):
        # Use `predict_target` to predict the tail entity
        prediction_df = predict.predict_target(
            model=model,
            head=head_label,
            relation=named_after_label,
            triples_factory=result.training,
        ).df
        predicted = prediction_df.head(TOP_K)

        head_id_label = _english_label(head_label)
        header_file.write(
            "Predictions for " + head_label + " with label " + head_id_label + ":\n"
        )

        for _, row in predicted.iterrows():
            score = row['score']
            tail = row['tail_label']
            tail_id_label = _english_label(tail)
            rows_file.write(
                "Predicted tail " + tail + " with score " + str(score)
                + " with tail label " + tail_id_label + "\n"
            )

        # Blank lines separate the predictions of consecutive head entities.
        rows_file.write("\n\n")

        # Flush per entity so partial results are on disk during a long run.
        header_file.flush()
        rows_file.flush()

        # Store the prediction data frame
        predictions[head_label] = predicted
        if (i % 100) == 0:
            print(f'Generated predictions for entity {i} out of {numOfAllEntities}')
