In [3]:
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
import requests
import os
import psutil
from functions import *
from pykeen.datasets import WN18RR
from pykeen.nn.init import PretrainedInitializer
from pykeen import pipeline
import wandb
from pykeen.trackers import WANDBResultTracker

In [4]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
#WANDBResultTracker(project="kge_train")

In [None]:
# Authenticate with Weights & Biases, then open a run in the "kge_train2" project.
# The handle is kept in `run`.
wandb.login()
run = wandb.init(project="kge_train2")

In [None]:
# Report host RAM usage and, when a GPU is present, the memory PyTorch holds on device 0.
print("Virtual Memory percent: ", psutil.virtual_memory().percent)
if torch.cuda.is_available():
    # memory_reserved() is the replacement for the deprecated memory_cached()
    print('GPU:  Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB |',
          'Cached: ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')
else:
    print('GPU:  not available')

#### 1. Load BERT word embeddings 

In [5]:
# Directory that holds the saved BERT embedding files for WN18RR
path = 'nlm_embeddings/bert_wn18rr'
# load_ent_embeddings is not defined in this notebook (presumably it comes from the
# star import of the local `functions` module). It returns the raw entity embeddings
# and the raw relation embeddings — presumably placed on `device`, TODO confirm.
ent_embedd_raw, rel_embedd_raw = load_ent_embeddings(path, device)

['.ipynb_checkpoints', '00_bert_4lastlayers_wn18rr_rel.pt', '01_bert_4lastlayers_wn18rr_ent.pt', '02_bert_4lastlayers_wn18rr_ent.pt', '03_bert_4lastlayers_wn18rr_ent.pt', '04_bert_4lastlayers_wn18rr_ent.pt', '05_bert_4lastlayers_wn18rr_ent.pt', '06_bert_4lastlayers_wn18rr_ent.pt', '07_bert_4lastlayers_wn18rr_ent.pt', '08_bert_4lastlayers_wn18rr_ent.pt', '09_bert_4lastlayers_wn18rr_ent.pt', '10_bert_4lastlayers_wn18rr_ent.pt', '11_bert_4lastlayers_wn18rr_ent.pt', '12_bert_4lastlayers_wn18rr_ent.pt', '13_bert_4lastlayers_wn18rr_ent.pt', '14_bert_4lastlayers_wn18rr_ent.pt', '15_bert_4lastlayers_wn18rr_ent.pt', '16_bert_4lastlayers_wn18rr_ent.pt', '17_bert_4lastlayers_wn18rr_ent.pt']


In [6]:
# The raw embeddings are stored batch by batch. Regroup them so that every hidden
# layer holds the tensors of all entities (batches are joined along dim 0).
num_layers = len(ent_embedd_raw[0])
embeddings = [
    torch.cat([batch[layer_idx] for batch in ent_embedd_raw], dim=0)
    for layer_idx in range(num_layers)
]
del ent_embedd_raw  # free memory: the per-batch tensors are no longer needed

# Join the hidden layers of each entity into one vector
ent_embedd_concat = concat_hidden_states(embeddings)
del embeddings  # free memory
ent_embedd_concat.size()

torch.Size([40943, 3072])

In [7]:
# Concatenate the hidden states of the raw relation embeddings into one matrix
relation_embedd = concat_hidden_states(rel_embedd_raw)

In [8]:
# Sanity check: shape of the relation embedding matrix
relation_embedd.size()

torch.Size([11, 3072])

In [9]:
# Sanity check: device on which the entity embeddings are stored
ent_embedd_concat.device

device(type='cuda', index=0)

#### 2. Align the LM-generated tensors with the PyKEEN entity order

##### 2.1 Load dataset from pykeen & entity text description from data directory

In [10]:
# WN18RR dataset object from PyKEEN
dataset = WN18RR()

# Tab-separated file without a header row: one id column and one text column
df_entity2text = pd.read_csv(
    'data/wn18rr_entity2text.txt',
    sep="\t",
    header=None,
    names=["id", "definition"],
)
# Split the text at its first comma: the part before it becomes "entity",
# the remainder becomes "description"
name_and_description = df_entity2text["definition"].str.split(',', n=1, expand=True)
df_entity2text[["entity", "description"]] = name_and_description
# Store ids as strings, left-padded with zeros to a width of 8 characters
df_entity2text["id"] = df_entity2text["id"].astype(str).str.rjust(8, '0')
df_entity2text.head(3)

Unnamed: 0,id,definition,entity,description
0,14854262,"stool, solid excretory product evacuated from ...",stool,solid excretory product evacuated from the bo...
1,590383,"chieftainship, the position of chieftain",chieftainship,the position of chieftain
2,8769179,"saxony, an area in Germany around the upper El...",saxony,an area in Germany around the upper Elbe rive...


##### 2.2 Store the entity and relation ids in lists and map them to the BERT-generated tensors

In [11]:
# Entity / relation labels ordered by their PyKEEN integer id, so that position i of
# each list holds the label whose id is i. Sorting by id makes this ordering explicit
# instead of relying on the insertion order of the label-to-id dictionaries.
# The embedding matrix built from ent_id_list is later passed to a pretrained
# initializer, which presumably expects row i to belong to id i — TODO confirm.
ent_id_list = sorted(dataset.entity_to_id, key=dataset.entity_to_id.get)
rel_id_list = sorted(dataset.relation_to_id, key=dataset.relation_to_id.get)

You're trying to map triples with 212 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 210 from 3134 triples were filtered out


In [13]:
# Reorder the BERT entity embeddings so that row i belongs to ent_id_list[i].
# Row k of `ent_embedd_concat` is treated as the embedding of row k of `df_entity2text`.
# One vectorised lookup replaces a full DataFrame scan per entity.
# get_indexer raises if the id column contains duplicates and returns -1 for ids
# that have no row in df_entity2text.
text_row_of_id = pd.Index(df_entity2text["id"])
row_positions = text_row_of_id.get_indexer(ent_id_list)

unmatched = [ent for ent, pos in zip(ent_id_list, row_positions) if pos < 0]
if unmatched:
    raise KeyError(
        f"{len(unmatched)} entity ids are missing from df_entity2text, e.g. {unmatched[:5]}"
    )

row_positions = torch.as_tensor(row_positions, dtype=torch.long, device=ent_embedd_concat.device)
entity_embedd = ent_embedd_concat.index_select(0, row_positions)

# Dictionary from entity id to its embedding row, kept for lookups by id
entity_mapping = dict(zip(ent_id_list, entity_embedd))

In [14]:
# Move both embedding matrices to `device`
entity_embedd = entity_embedd.to(device)
relation_embedd = relation_embedd.to(device)

In [None]:
#tracker = WANDBResultTracker(project="kge_train")
#tracker.start_run(run_name="test")

In [None]:
#training_triples_factory = dataset.training

In [None]:
#training_triples_factory

##### Own Pipeline

In [None]:
# Manual training pipeline: a TransE model trained with the sLCWA training loop and Adam.
from torch.optim import Adam
from pykeen.models import TransE
from pykeen.training import SLCWATrainingLoop

# The training triples factory was only assigned in a commented-out cell, so this
# cell raised a NameError on a fresh kernel; it is defined here instead.
training_triples_factory = dataset.training

# Pick a model. The commented arguments switch from the default initialisation
# to the pretrained BERT embeddings.
model = TransE(triples_factory=training_triples_factory,
               #embedding_dim=entity_embedd.shape[-1],
               #entity_initializer=PretrainedInitializer(tensor=entity_embedd),
               #relation_initializer=PretrainedInitializer(tensor=relation_embedd)
              )
model.to(device)

# Pick an optimizer from Torch
optimizer = Adam(params=model.get_grad_params())

# Pick a training approach (sLCWA or LCWA)
training_loop = SLCWATrainingLoop(
    model = model,
    triples_factory = training_triples_factory,
    optimizer = optimizer,
)

# Train the model; the return value of train() is kept in `model_train`
model_train = training_loop.train(
    triples_factory = training_triples_factory,
    num_epochs = 3,
    batch_size = 256
)

In [None]:
# Pick an evaluator
from pykeen.evaluation import RankBasedEvaluator
evaluator = RankBasedEvaluator()

# Get triples to test
mapped_triples = dataset.testing.mapped_triples

# Evaluate
results = evaluator.evaluate(
    model = model,
    mapped_triples = mapped_triples,
    batch_size=10,
    additional_filter_triples=[
            dataset.training.mapped_triples,
            dataset.validation.mapped_triples]
)

In [None]:
# A cell only displays its last expression, so collect both metrics in one dict
# instead of computing hits@10 and silently discarding it.
{
    "hits_at_10": results.get_metric('hits_at_10'),
    "mean_rank": results.get_metric('mean_rank'),
}

##### Pykeen Pipeline 

In [15]:
from pykeen.pipeline import pipeline

In [None]:
# Train KGE model with input data → Save results
result = pipeline(
    dataset="wn18rr",
    dataset_kwargs=dict(create_inverse_triples=False),
    model="transe",
    model_kwargs=dict(
        #automatic_memory_optimization=True,
        #embedding_dim=256,
        scoring_fct_norm=1,
        embedding_dim=entity_embedd.shape[-1],
        entity_initializer=PretrainedInitializer(tensor=entity_embedd),
        relation_initializer=PretrainedInitializer(tensor=relation_embedd)
    ),
    result_tracker='wandb',
    result_tracker_kwargs=dict(
        project='transE',
        #experiment='SGD_lr1e-5'
    ),
    optimizer='adam',
    optimizer_kwargs=dict(lr=0.0018, weight_decay=0.0),
    loss='softplus',
    training_loop='lcwa',
    training_kwargs=dict(num_epochs=10, batch_size=256, label_smoothing=0.00200051768009458),
    regularizer="no",
    evaluator="rankbased",
    evaluator_kwargs=dict(filtered=True),
    evaluation_kwargs=dict(batch_size=16)
)

#result.save_to_directory("01_models/nations_transE_word2vec_no1")

No random seed is specified. Setting to 2919109088.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mvjolacl[0m ([33mnlm_kgc[0m). Use [1m`wandb login --relogin`[0m to force relogin


INFO:pykeen.training.training_loop:Starting sub_batch_size search for training now...
INFO:pykeen.training.training_loop:Concluded search with sub_batch_size 8.


Training epochs on cuda:0:   0%|          | 0/10 [00:00<?, ?epoch/s]

Training batches on cuda:0:   0%|          | 0/245 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/245 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/245 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/245 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/245 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/245 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/245 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/245 [00:00<?, ?batch/s]

In [None]:
# Persist the pipeline result to disk.
# NOTE(review): the target is a directory although the name ends in ".pt", and the
# name says "random" while the pipeline cell in this notebook uses the pretrained
# BERT initializers — confirm the name matches the experiment that was actually run.
result.save_to_directory("models/wn18rr_transE_random_benchmarkparam.pt")

In [None]:
# Pull the three headline metrics out of the pipeline result and collect them in one dict
hitsat10 = result.get_metric("hits_at_10")
mr = result.get_metric("mean_rank")
mrr = result.get_metric("mean_reciprocal_rank")
key_metrics = {"hits_at_10": hitsat10, "mean_rank": mr, "mean_reciprocal_rank": mrr}

In [None]:
# Display the "hits_at_10" metric of the pipeline run
hitsat10

In [None]:
# Display the "mean_rank" metric of the pipeline run
mr 

In [None]:
# Display the "mean_reciprocal_rank" metric of the pipeline run
mrr 

##### HPO Pipeline

In [None]:
from pykeen.hpo import hpo_pipeline

In [None]:
# Hyper-parameter search for TransE on WN18RR: 10 trials with `epochs=5`,
# tracked in the "kge_train" Weights & Biases project.
hpo_pipeline_result = hpo_pipeline(
    dataset='wn18rr',
    model='TransE',
    epochs=5,
    device=device,
    n_trials=10,
    result_tracker='wandb',
    result_tracker_kwargs=dict(
        project='kge_train',
        #experiment='new run',
        #reinit=True,
    ),
    # Disabled: start the search from the pretrained BERT embeddings.
    # NOTE(review): these are fixed values rather than search ranges, so they
    # presumably belong in `model_kwargs` instead of `model_kwargs_ranges` —
    # confirm against the hpo_pipeline documentation before enabling.
    #model_kwargs_ranges=dict(
        #embedding_dim=entity_embedd.shape[-1],
        #entity_initializer=PretrainedInitializer(tensor=entity_embedd),
        #relation_initializer=PretrainedInitializer(tensor=relation_embedd)
    #),
)

In [None]:
#hpo_pipeline_result.save_to_directory('hpo_pipeline/transE_wn18rr_randinit.pt')