In [1]:
! pip install pykeen
! pip install transformers
! pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pykeen
  Downloading pykeen-1.10.1-py3-none-any.whl (739 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m739.3/739.3 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json (from pykeen)
  Downloading dataclasses_json-0.5.8-py3-none-any.whl (26 kB)
Collecting click-default-group (from pykeen)
  Downloading click-default-group-1.2.2.tar.gz (3.3 kB)
  Preparing metadata (setup.py) ... done
Collecting optuna>=2.0.0 (from pykeen)
  Downloading optuna-3.2.0-py3-none-any.whl (390 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m390.6/390.6 kB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
Collecting more-click (from pykeen)
  Downloading more_click-0.1.2-py3-none-any.whl (6.7 kB)
Collecting pystow>=0.4.3 (from pykeen)
  Downloading pystow-0.5.0-py3-none-any.whl (37 kB)
Collecting docdata (from pykeen)
  Downloading 

## Configuration: dataset URLs and local file paths

In [2]:
# Root of the LiterallyWikidata dataset files on GitHub.
# Deliberately has no trailing slash: every URL below joins with an explicit
# "/", so a trailing slash here would yield ".../Datasets//LitWD48K/..." with
# an empty path segment.
BASE_URL = "https://raw.githubusercontent.com/GenetAsefa/LiterallyWikidata/main/Datasets"

# Triple splits and numeric literals of the LitWD48K dataset.
TRIPLES_VALID_URL = f"{BASE_URL}/LitWD48K/valid.txt"
TRIPLES_TEST_URL = f"{BASE_URL}/LitWD48K/test.txt"
TRIPLES_TRAIN_URL = f"{BASE_URL}/LitWD48K/train.txt"
LITERALS_URL = f"{BASE_URL}/LitWD48K/numeric_literals.txt"

# English label files for attributes, entities and relations.
ATTRIBUTES_LABELS_URL = f"{BASE_URL}/Attributes/attribute_labels_en.txt"
ENTITIES_LABELS_URL = f"{BASE_URL}/Entities/entity_labels_en.txt"
RELATIONS_LABELS_URL = f"{BASE_URL}/Relations/relation_labels_en.txt"

# Local paths (relative to the working directory) the notebook writes to and reads from.
VALID_PATH = './valid.txt'
TEST_PATH = './test.txt'
TRAIN_PATH = './train.txt'
INITIAL_LITERALS_PATH = './init_literals.txt'  # literals exactly as downloaded
LITERALS_PATH = './literals.txt'               # literals after preprocessing
LABELS_PATH = './labels.txt'

## Load data

In [3]:
import requests

def save_to_file(url, path, timeout=30):
    """Download the text at ``url`` and write it to ``path``.

    Args:
        url: Address to fetch with an HTTP GET request.
        path: Local file that receives the response body (overwritten if present).
        timeout: Seconds to wait for the server before giving up.

    Raises:
        RuntimeError: If the server answers with any status other than 200.
            Nothing is written in that case, so a failed download can no longer
            leave an empty file behind for later cells to trip over.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to fetch file from URL: {url} (HTTP status {response.status_code})"
        )

    # Explicit UTF-8 so the result does not depend on the platform's default encoding.
    with open(path, 'w', encoding='utf-8') as fp:
        fp.write(response.text)

# Fetch every remote dataset file to its local path: the three triple splits
# first, then the raw numeric literals.
for source_url, target_path in (
    (TRIPLES_VALID_URL, VALID_PATH),
    (TRIPLES_TEST_URL, TEST_PATH),
    (TRIPLES_TRAIN_URL, TRAIN_PATH),
    (LITERALS_URL, INITIAL_LITERALS_PATH),
):
    save_to_file(source_url, target_path)

def merge_labels_files(urls, timeout=30):
    """Download several label files and concatenate their text in order.

    Args:
        urls: Iterable of addresses to fetch with HTTP GET.
        timeout: Seconds to wait for each server response before giving up.

    Returns:
        The bodies of all successfully fetched files joined into one string.
        A URL that does not answer with status 200 is reported on stdout and
        skipped. An empty ``urls`` yields an empty string.
    """
    chunks = []
    for url in urls:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch file from URL: {url}")
            continue

        text = response.text
        # Make sure each file ends with a newline; otherwise its last record
        # would be fused with the first record of the next file.
        if text and not text.endswith("\n"):
            text += "\n"
        chunks.append(text)

    return "".join(chunks)

# Label files to combine, in the order in which they are concatenated.
file_urls = [
    ATTRIBUTES_LABELS_URL,
    ENTITIES_LABELS_URL,
    RELATIONS_LABELS_URL,
]

labels = merge_labels_files(file_urls)

# Write as UTF-8 explicitly so the result does not depend on the platform's
# default encoding (labels presumably contain non-ASCII characters; a
# non-UTF-8 default could then fail or produce a differently encoded file).
with open(LABELS_PATH, 'w', encoding='utf-8') as fp:
    fp.write(labels)

## Preprocess literals

In [4]:
import pandas as pd
from datetime import datetime, timezone

def convert_literals_to_float(path=None):
    """Load the raw literals file, drop dateTime literals and strip datatype suffixes.

    Note: despite its name this function does not cast anything to ``float``.
    Values are returned as strings with everything from the first ``"^^"``
    onwards removed.

    Args:
        path: Tab-separated file with the columns id, relation and value (no
            header row). Defaults to ``INITIAL_LITERALS_PATH``.

    Returns:
        A DataFrame with the columns ``id``, ``relation`` and ``value`` that
        contains every row whose value is not an XML Schema dateTime literal.
    """
    if path is None:
        path = INITIAL_LITERALS_PATH

    # Read `value` as text so the string operations below also work when every
    # value in the file happens to look like a plain number.
    literals = pd.read_csv(
        path,
        sep="\t",
        names=['id', 'relation', 'value'],
        dtype={'value': str},
    )

    # Literal (non-regex) match of the datatype IRI; rows with a missing value
    # count as "not a dateTime" instead of breaking the boolean mask.
    is_datetime = literals['value'].str.contains(
        "<http://www.w3.org/2001/XMLSchema#dateTime>", regex=False, na=False
    )

    # Build a new frame instead of assigning a column on a filtered slice.
    kept = literals.loc[~is_datetime]
    return kept.assign(
        value=kept['value'].map(lambda value: value.split("^^")[0], na_action='ignore')
    )

# Preprocess the downloaded literals and persist them as a tab-separated file
# without header row or index column.
converted_literals = convert_literals_to_float()
converted_literals.to_csv(
    LITERALS_PATH,
    index=False,
    header=False,
    sep='\t',
)

## Load dataset

In [5]:
from pykeen.datasets.literal_base import NumericPathDataset

class LitWD48K(NumericPathDataset):
    """LitWD48K dataset read from the local triple and literal files of this notebook."""

    def __init__(self, **kwargs):
        """Pass the local file paths, plus any extra ``kwargs``, to the parent constructor."""
        local_paths = dict(
            training_path=TRAIN_PATH,
            testing_path=TEST_PATH,
            validation_path=VALID_PATH,
            literals_path=LITERALS_PATH,
        )
        super().__init__(**local_paths, **kwargs)


dataset = LitWD48K()

INFO:pykeen.utils:Using opt_einsum


## Model, Optimizer and Training Definition

In [6]:
# Pick a model
from pykeen.models import ComplExLiteral

# Fixed seed handed to the model so that a fresh "Restart & Run All" starts
# from the same random state instead of a new one on every run.
RANDOM_SEED = 42

training_triples_factory = dataset.training
model = ComplExLiteral(
    triples_factory=training_triples_factory,
    random_seed=RANDOM_SEED,
)

# Pick an optimizer from Torch
from torch.optim import Adam
optimizer = Adam(params=model.get_grad_params())

# Pick a training approach (sLCWA or LCWA)
from pykeen.training import SLCWATrainingLoop
training_loop = SLCWATrainingLoop(
    model=model,
    triples_factory=training_triples_factory,
    optimizer=optimizer,
)



## Training

In [7]:
# Training hyper-parameters, named so they are easy to find and adjust.
NUM_EPOCHS = 5
BATCH_SIZE = 256

# The return value is bound to `_` so the cell does not echo it.
_ = training_loop.train(
    batch_size=BATCH_SIZE,
    num_epochs=NUM_EPOCHS,
    triples_factory=training_triples_factory,
)

Training epochs on cpu:   0%|          | 0/5 [00:00<?, ?epoch/s]

Training batches on cpu:   0%|          | 0/1185 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1185 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1185 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1185 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/1185 [00:00<?, ?batch/s]

## Evaluation

In [None]:
# Pick an evaluator
from pykeen.evaluation import RankBasedEvaluator
evaluator = RankBasedEvaluator()

# Triples the model is scored on
mapped_triples = dataset.testing.mapped_triples

# Training and validation triples, passed to the evaluator as additional filter triples
known_triples = [
    dataset.training.mapped_triples,
    dataset.validation.mapped_triples,
]

# Evaluate
results = evaluator.evaluate(
    model=model,
    mapped_triples=mapped_triples,
    additional_filter_triples=known_triples,
    batch_size=1024,
)
# print(results)

## Load labels

In [8]:
import pandas as pd

def load_labels(url):
    """Read a header-less, tab-separated label file into a DataFrame.

    Args:
        url: Location of the file; passed straight to ``pd.read_csv``.

    Returns:
        DataFrame with the two columns ``id`` and ``label``.
    """
    column_names = ['id', 'label']
    label_frame = pd.read_csv(url, names=column_names, sep="\t")
    return label_frame

# ATTRIBUTES_LABELS_URL, ENTITIES_LABELS_URL and RELATIONS_LABELS_URL are the
# constants from the configuration cell; they are not redefined here so there
# is a single place to change them.
attributes_labels = load_labels(ATTRIBUTES_LABELS_URL)
entities_labels = load_labels(ENTITIES_LABELS_URL)
relations_labels = load_labels(RELATIONS_LABELS_URL)

# One lookup table for all ids. ignore_index gives the combined frame a fresh
# 0..n-1 index instead of three overlapping ranges.
labels = pd.concat(
    [attributes_labels, entities_labels, relations_labels],
    ignore_index=True,
)


# Sentinel that distinguishes "no default given" from an explicit default of None.
_NO_DEFAULT = object()


def get_label_for_id(id, default=_NO_DEFAULT):
    """Return the label stored for ``id`` in the module-level ``labels`` frame.

    Args:
        id: Identifier to look up in the ``id`` column.
        default: Value to return when ``id`` is not present. When omitted, an
            unknown id raises instead.

    Returns:
        The ``label`` value of the first row whose ``id`` equals ``id``, or
        ``default`` if there is no such row and a default was supplied.

    Raises:
        IndexError: If ``id`` is unknown and no ``default`` was supplied. The
            exception type is unchanged; the message now names the missing id.
    """
    # Single .loc selection instead of chained frame[mask][column] indexing.
    matches = labels.loc[labels["id"] == id, "label"]
    if matches.empty:
        if default is _NO_DEFAULT:
            raise IndexError(f"No label found for id {id!r}")
        return default
    return matches.values[0]

## Generate predictions

In [None]:
import torch
from pykeen.models import Model
from pykeen import predict

from pykeen.datasets import get_dataset
from pykeen.predict import predict_target

# All report lines go to this one file. Previously the "Predictions for ..."
# header lines were appended to "predicted.txt" while the rows they introduce
# were appended to "predictions.csv". The existing name is kept although the
# content is plain text, not CSV.
PREDICTIONS_PATH = "predictions.csv"
TOP_K = 5             # number of best-scoring tails kept per head entity
PROGRESS_EVERY = 100  # print a progress line every this many entities

# Create a mapping for relations
relation_label_to_id = dataset.training.relation_to_id

# Find the relation id for "named after"
named_after_id = relation_label_to_id["P138"]

# Get the relation label
named_after_label = dataset.training.relation_id_to_label[named_after_id]

# Top-k prediction frame per head entity, keyed by the head's entity label
predictions = {}

numOfAllEntities = model.num_entities

# Open the report once for the whole loop. Mode "w" starts from an empty file,
# so re-running the cell no longer appends a second copy of the report.
with open(PREDICTIONS_PATH, "w", encoding="utf-8") as report:
    for i, head_label in enumerate(dataset.training.entity_id_to_label.values(), start=1):
        # Use `predict_target` to predict the tail entity
        prediction_df = predict.predict_target(
            model=model,
            head=head_label,
            relation=named_after_label,
            triples_factory=dataset.training,
        ).df
        predicted = prediction_df.head(TOP_K)

        head_id_label = get_label_for_id(head_label)
        # f-strings instead of "+" so that a non-string label cannot raise a TypeError.
        report.write(f"Predictions for {head_label} with label {head_id_label}:\n")

        for _, row in predicted.iterrows():
            score = row['score']
            tail = row['tail_label']
            tail_id_label = get_label_for_id(tail)
            report.write(
                f"Predicted tail {tail} with score {score} with tail label {tail_id_label}\n"
            )

        # Blank lines separate the report sections of consecutive head entities.
        report.write("\n\n")

        # Store the prediction data frame
        predictions[head_label] = predicted
        if i % PROGRESS_EVERY == 0:
            print(f'Generated predictions for entity {i} out of {numOfAllEntities}')
