In [None]:
! pip install pykeen
! pip install transformers
! pip install torch

## Env variables

In [2]:
BASE_URL = "https://raw.githubusercontent.com/GenetAsefa/LiterallyWikidata/main/Datasets/"
TRIPLES_VALID_URL = f"{BASE_URL}/LitWD48K/valid.txt"
TRIPLES_TEST_URL = f"{BASE_URL}/LitWD48K/test.txt"
TRIPLES_TRAIN_URL = f"{BASE_URL}/LitWD48K/train.txt"
LITERALS_URL = f"{BASE_URL}/LitWD48K/numeric_literals.txt"

ATTRIBUTES_LABELS_URL = f"{BASE_URL}/Attributes/attribute_labels_en.txt"
ENTITIES_LABELS_URL = f"{BASE_URL}/Entities/entity_labels_en.txt"
RELATIONS_LABELS_URL = f"{BASE_URL}/Relations/relation_labels_en.txt"

VALID_PATH = './valid.txt'
TEST_PATH = './test.txt'
TRAIN_PATH = './train.txt'
INITIAL_LITERALS_PATH = './init_literals.txt'
LITERALS_PATH = './literals.txt'
LABELS_PATH = './labels.txt'

## Load data

In [3]:
import requests

def save_to_file(url, path):
    content = ""
    response = requests.get(url)
    if response.status_code == 200:
        content += response.text
    else:
        print(f"Failed to fetch file from URL: {url}")

    with open(path, 'w') as fp:
      fp.write(content)

save_to_file(TRIPLES_VALID_URL, VALID_PATH)
save_to_file(TRIPLES_TEST_URL, TEST_PATH)
save_to_file(TRIPLES_TRAIN_URL, TRAIN_PATH)
save_to_file(LITERALS_URL, INITIAL_LITERALS_PATH)

def merge_labels_files(urls):
    labels = ""
    for url in urls:
        response = requests.get(url)
        if response.status_code == 200:
            labels += response.text
        else:
            print(f"Failed to fetch file from URL: {url}")

    return labels

file_urls = [
    ATTRIBUTES_LABELS_URL,
    ENTITIES_LABELS_URL,
    RELATIONS_LABELS_URL
]

labels = merge_labels_files(file_urls)

with open(LABELS_PATH, 'w') as fp:
    fp.write(labels)

## Preprocess literals

In [4]:
import pandas as pd
from datetime import datetime, timezone

def convert_literals_to_float():
    literals = pd.read_csv(INITIAL_LITERALS_PATH, sep="\t", names = ['id', 'relation', 'value'])
    literals = literals[~literals['value'].str.contains("<http://www.w3.org/2001/XMLSchema#dateTime>")]
    literals.value = [value.split("^^")[0] for value in literals.value]
    return literals

converted_literals = convert_literals_to_float()
converted_literals.to_csv(LITERALS_PATH, sep='\t', header=False, index=False)

## Load dataset

In [5]:
from pykeen.datasets.literal_base import NumericPathDataset

class LitWD48K(NumericPathDataset):
    def __init__(self, **kwargs):
        super().__init__(
            training_path=TRAIN_PATH,
            testing_path=TEST_PATH,
            validation_path=VALID_PATH,
            literals_path=LITERALS_PATH,
            **kwargs,
        )

dataset = LitWD48K()

INFO:pykeen.utils:Using opt_einsum


## Model, Optimizer and Training Definition

In [6]:
# Pick a model
from pykeen.models import ComplExLiteral

training_triples_factory=dataset.training
model = ComplExLiteral(triples_factory=training_triples_factory)

# Pick an optimizer from Torch
from torch.optim import Adam
optimizer = Adam(params=model.get_grad_params())

# Pick a training approach (sLCWA or LCWA)
from pykeen.training import SLCWATrainingLoop
training_loop = SLCWATrainingLoop(
    model=model,
    triples_factory=training_triples_factory,
    optimizer=optimizer,
)



## Training

In [None]:
_ = training_loop.train(
    triples_factory=training_triples_factory,
    num_epochs=5,
    batch_size=256,
)

Training epochs on cpu:   0%|          | 0/5 [00:00<?, ?epoch/s]

Training batches on cpu:   0%|          | 0/1185 [00:00<?, ?batch/s]

## Evaluation

In [None]:
# Pick an evaluator
from pykeen.evaluation import RankBasedEvaluator
evaluator = RankBasedEvaluator()

# Get triples to test
mapped_triples = dataset.testing.mapped_triples

# Evaluate
results = evaluator.evaluate(
    model=model,
    mapped_triples=mapped_triples,
    batch_size=1024,
    additional_filter_triples=[
        dataset.training.mapped_triples,
        dataset.validation.mapped_triples,
    ],
)
# print(results)

## Load labels

In [35]:
import pandas as pd

def load_labels(url):
  labels = pd.read_csv(url, sep="\t", names = ['id', 'label']).label
  return [label for label in labels if str(label) != 'nan']

ATTRIBUTES_LABELS_URL = f"{BASE_URL}/Attributes/attribute_labels_en.txt"
ENTITIES_LABELS_URL = f"{BASE_URL}/Entities/entity_labels_en.txt"
RELATIONS_LABELS_URL = f"{BASE_URL}/Relations/relation_labels_en.txt"

attributes_labels = load_labels(ATTRIBUTES_LABELS_URL)
entities_labels = load_labels(ENTITIES_LABELS_URL)
relations_labels = load_labels(RELATIONS_LABELS_URL)

labels = attributes_labels + entities_labels + relations_labels

## Get embeddings for entity labels

In [38]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

my_literal_information = []

batch_size = 50
for i in range(0, len(labels), batch_size):
    batch_labels = labels[i:i+batch_size]

    # Tokenize labels and convert to input IDs
    inputs = tokenizer(batch_labels, return_tensors='pt', padding=True, truncation=True, max_length=128)

    # Generate embeddings
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the embeddings from the last hidden state
    embeddings = outputs.last_hidden_state

    # Take the mean of the embeddings to get a single vector for each label
    batch_literal_information = embeddings.mean(dim=1)

    my_literal_information.append(batch_literal_information)

# Concatenate all batches
my_literal_information = torch.cat(my_literal_information)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [40]:
my_literal_information

tensor([[-0.3876, -0.1854, -0.1416,  ..., -0.1650,  0.0916,  0.0933],
        [-0.2391,  0.0040,  0.0494,  ...,  0.1673,  0.7115, -0.0766],
        [-0.5455, -0.3217, -0.0490,  ...,  0.1380, -0.1209, -0.1049],
        ...,
        [-0.0571, -0.0751,  0.0626,  ..., -0.2264,  0.2386,  0.1435],
        [ 0.0218, -0.1775,  0.0632,  ...,  0.0607,  0.1923, -0.1187],
        [ 0.4200,  0.3576, -0.3262,  ..., -0.3475, -0.2278,  0.2865]])

## Train model

In [67]:
from pykeen.pipeline import pipeline

result = pipeline(
    dataset=dataset,
    model='ComplEx',
    training_kwargs=dict(num_epochs=250),
    evaluation_relation_whitelist={"P138"}

)

INFO:pykeen.triples.triples_factory:keeping 1/257 (0.39%) relations.
INFO:pykeen.triples.triples_factory:keeping 82/16838 (0.49%) triples.
INFO:pykeen.triples.triples_factory:keeping 1/257 (0.39%) relations.
INFO:pykeen.triples.triples_factory:keeping 81/16838 (0.48%) triples.
INFO:pykeen.pipeline.api:Using device: None


Training epochs on cpu:   0%|          | 0/250 [00:00<?, ?epoch/s]

Training batches on cpu:   0%|          | 0/1185 [00:00<?, ?batch/s]

KeyboardInterrupt: ignored

In [None]:
result.save_to_directory('doctests/test_unstratified_complex')

## Generate predictions

In [None]:
import torch
from pykeen.models import Model
from pykeen import predict

from pykeen.datasets import get_dataset
from pykeen.predict import predict_target

# Get the trained model
model = result.model

# Create a mapping for relations
relation_label_to_id = result.training.relation_to_id

# Find the relation id for "named after"
named_after_id = relation_label_to_id["P138"]

# use conferences relations for the Nations dataset, remove this when used for named after and Wikidata
# named_after_id = relation_label_to_id["conferences"]

# Get the relation label
named_after_label = result.training.relation_id_to_label[named_after_id]

# Prepare a dictionary to store predictions
predictions = {}

# Loop over all entities
numOfAllEntities = model.num_entities
i = 1
for head_id, head_label in result.training.entity_id_to_label.items():
    # Use `predict_target` to predict the tail entity
    prediction_df = predict.predict_target(
        model=model,
        head=head_label,
        relation=named_after_label,
        triples_factory=result.training,
    ).df

    id = head_label

    # Create parameters
    params = {
                'action': 'wbgetentities',
                'ids':id,
                'format': 'json',
                'languages': 'en'
            }

    # fetch the API
    data = fetch_wikidata(params)

    # Show response
    data = data.json()
    head_id_label = data['entities'][id]['labels']['en']['value']

    f = open("predicted.txt", "a")
    line = "Predictions for " + head_label + " with label " + head_id_label + ":\n"
    f.write(line)
    f.close()

    predicted = prediction_df.head(5)
    for index, row in predicted.iterrows():
        score = row['score']
        tail = row['tail_label']
        params = {
              'action': 'wbgetentities',
              'ids':tail,
              'format': 'json',
              'languages': 'en'
          }
        tailData = fetch_wikidata(params)

        data = tailData.json()
        tail_id_label = data['entities'][tail]['labels']['en']['value']
        f = open("gosho.csv", "a")
        line = "Predicted tail " + tail + " with score " + str(score) + " with tail label " + tail_id_label + "\n"
        f.write(line)
        f.close()

    f = open("gosho.csv", "a")
    f.write("\n\n")
    f.close()

    # Store the prediction data frame
    predictions[head_label] = prediction_df.head(5)
    if (i % 100) == 0:
        print(f'Generated predictions for entity {i} out of {numOfAllEntities}')
    i += 1
