In [1]:
# load graph
import json
from collections import Counter, defaultdict
import numpy as np

from hdt import HDTDocument, TripleComponentRole

from settings import *
from predicates import properties


# KG constants: name of the HDT dump, namespace handed to the hop
# configuration, and the URI prefixes used to build entity / predicate URIs
hdt_file = 'wikidata2018_09_11.hdt'
namespace = 'predef-wikidata2018-09-all'
PREFIX_E = 'http://www.wikidata.org/entity/'
PREFIX_P = 'http://www.wikidata.org/prop/'
predicates_ids = []

# open the HDT dump (`hdt_path` is presumably provided by `settings` -- confirm)
kg = HDTDocument(hdt_path + hdt_file)

# prepare to retrieve all adjacent nodes including literals
# NOTE(review): the meaning of the positional arguments (1, ..., True, False)
# is not visible in this notebook -- confirm against the hdt build in use
kg.configure_hops(1, predicates_ids, namespace, True, False)

# load all predicate labels: map each property id (last path segment of the
# property URI) to its label
relationid2label = {
    binding['property']['value'].split('/')[-1]: binding['propertyLabel']['value']
    for binding in properties['results']['bindings']
}

# load the training dataset
train_conversations_path = './data/train_set/train_set_ALL.json'

# the encoding is stated explicitly so that decoding the JSON file does not
# depend on the platform's default locale
with open(train_conversations_path, "r", encoding="utf-8") as data:
    conversations = json.load(data)
print("%d conversations loaded"%len(conversations))

# model init
import torch
from transformers import BertTokenizer, BertConfig

from MPBert_sampler_model import MessagePassingHDTBert
from utils import adj

# reproducibility: turn off cudnn benchmarking, request deterministic cudnn
# behaviour, and seed torch's random number generator with 0
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.manual_seed(0)

# `relationid2label` (property id -> label) is already built from `properties`
# earlier in this cell; it is reused here instead of being rebuilt a second time.

# all predicate labels (one per property id) and, index-aligned with them,
# the KG-internal global id of each predicate
all_predicate_labels = list(relationid2label.values())
all_predicate_ids = [kg.string_to_global_id(PREFIX_P + p_id, TripleComponentRole.PREDICATE)
                     for p_id in relationid2label]
# both lists are derived from the same dict, so they must line up one-to-one
assert len(all_predicate_labels) == len(all_predicate_ids)

# model configuration
model_name = 'bert-base-uncased'
num_labels = 1
# size of the output layer, i.e., maximum number of entities in the subgraph
# that are candidate answers
num_entities = 12605
# total number of all possible relations in the Wikidata KG TODO is it too big??
num_relations = 7337

# pretrained configuration and tokenizer of the same BERT checkpoint
config = BertConfig.from_pretrained(model_name, num_labels=num_labels)
tokenizer = BertTokenizer.from_pretrained(model_name)

model = MessagePassingHDTBert(config, num_entities, num_relations, mp_layer=True)
# the model is left on the CPU; call model.cuda() here to run it on the GPU

# NOTE(review): this prints the configured constant `num_relations`, not a
# count measured from the loaded predicate labels
print("%d unique predicate labels in the KG"%num_relations)

6720 conversations loaded
7337 unique predicate labels in the KG


In [2]:
# settings for compiling training samples from the first question of each
# conversation; the compilation loop in this cell keeps only answers that are
# entities and that are NOT contained in the seed entity's subgraph

# limits handed to kg.compute_hops when the seed subgraph is retrieved
offset = 0
max_triples = 50000000

# predicate under which entity labels are looked up
rdfsLabelURI = 'http://www.w3.org/2000/01/rdf-schema#label'

# max size of the graph allowed in the number of edges
max_n_edges = 2409
graph_sizes = []

# container for the compiled training samples
train_dataset = []

def lookup_entity_labels(entity_ids):
    """Look up a label for each KG entity id.

    Every id is resolved to its URI in the notebook-level `kg` document and
    the triples (uri, rdfsLabelURI, *) are searched. Entities without any
    label triple are dropped from the result.

    Args:
        entity_ids: iterable of global entity ids (resolved with
            TripleComponentRole.OBJECT).

    Returns:
        (entity_labels, s_entity_ids): two index-aligned lists with the label
        text and the id of every entity that has at least one label.
    """
    entity_labels, s_entity_ids = [], []
    for e_id in entity_ids:
        e_uri = kg.global_id_to_string(e_id, TripleComponentRole.OBJECT)
        (triples, cardinality) = kg.search_triples(e_uri, rdfsLabelURI, "")
        if cardinality > 0:
            # only the first label triple returned by the search is used
            literal = triples.next()[2]
            # strip the enclosing quotes and the language marker that follows
            # the closing quote. The text between the FIRST and the LAST quote
            # is kept, so labels that themselves contain double quotes are not
            # cut off at the first embedded quote.
            first_quote = literal.find('"')
            last_quote = literal.rfind('"')
            if first_quote != -1 and last_quote > first_quote:
                label = literal[first_quote + 1:last_quote]
            else:
                # no quoted part to extract: keep the raw value
                label = literal
            entity_labels.append(label)
            s_entity_ids.append(e_id)

    assert len(entity_labels) == len(s_entity_ids)
    return entity_labels, s_entity_ids

# optionally restrict the run to the first `n_limit` conversations
# (a falsy value such as None keeps all of them)
n_limit = None
if n_limit:
    conversations = conversations[:n_limit]

# accumulators (not filled anywhere in the visible part of this notebook)
n_entities, n_edges = [], []

# start from an empty set of training samples
train_dataset = []

def _encode_question_pairs(question, candidate_labels):
    """Tokenize `question` paired with every label in `candidate_labels`.

    Uses the notebook-level `tokenizer` with identical settings for every
    candidate (special tokens added, max_length=64, padded to max length).

    Args:
        question: question text, used as the first segment of every pair.
        candidate_labels: iterable of label strings (second segment).

    Returns:
        (input_ids, token_type_ids, attention_masks): three index-aligned
        lists with one entry per candidate label, in input order.
    """
    input_ids, token_type_ids, attention_masks = [], [], []
    for candidate_label in candidate_labels:
        # encode a text pair of the question with a candidate label
        encoded_dict = tokenizer.encode_plus(question, candidate_label,
                                             add_special_tokens=True,
                                             max_length=64,
                                             pad_to_max_length=True,
                                             return_attention_mask=True,
                                             return_token_type_ids=True)
        input_ids.append(encoded_dict['input_ids'])
        token_type_ids.append(encoded_dict['token_type_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    return input_ids, token_type_ids, attention_masks


# only the first 100 conversations are scanned
for conversation in conversations[:100]:
    # consider 1st questions only
    for turn in conversation['questions'][:1]:

        question = turn['question']
        answer = turn['answer']

        # consider only answers which are entities; checked before the subgraph
        # is retrieved so that no KG lookup is spent on samples that are skipped
        if 'www.wikidata.org' not in answer:
            continue

        # use oracle for the correct initial entity
        seed_entity = conversation['seed_entity'].split('/')[-1]
        seed_entity_id = kg.string_to_global_id(PREFIX_E+seed_entity, TripleComponentRole.OBJECT)

        # retrieve all adjacent nodes including literals
        subgraph = kg.compute_hops([seed_entity_id], max_triples, offset)
        entity_ids, predicate_ids, adjacencies = subgraph
        assert len(predicate_ids) == len(adjacencies)

        answer_id = kg.string_to_global_id(PREFIX_E+answer.split('/')[-1], TripleComponentRole.OBJECT)

        # retain samples with answer outside the seed subgraph
        if answer_id in entity_ids:
            continue

        # questions paired with all relation labels in the KG as candidates
        p_input_ids, p_token_type_ids, p_attention_masks = _encode_question_pairs(
            question, all_predicate_labels)

        # questions paired with node labels as candidates: get labels for all
        # candidate entities in the seed subgraph (entities without a label are
        # dropped, so `entity_ids` is replaced by the filtered id list)
        entity_labels, entity_ids = lookup_entity_labels(entity_ids)
        e_input_ids, e_token_type_ids, e_attention_masks = _encode_question_pairs(
            question, entity_labels)
        assert len(e_input_ids) == len(entity_ids)

        # one sample = [entity candidates, predicate candidates, answer id]
        train_dataset.append([[torch.tensor(e_input_ids), torch.tensor(e_token_type_ids),
                               torch.tensor(e_attention_masks), torch.tensor(entity_ids)],
                              [torch.tensor(p_input_ids), torch.tensor(p_token_type_ids),
                               torch.tensor(p_attention_masks), torch.tensor(all_predicate_ids)],
                              torch.tensor([answer_id])])


print("Compiled dataset with %d samples" % len(train_dataset))

Compiled dataset with 4 samples


In [3]:
# training setup
from transformers import get_linear_schedule_with_warmup, AdamW

epochs = 4
# number of scheduler steps the learning-rate schedule is laid out for:
# one per training sample and epoch
total_steps = epochs * len(train_dataset)

optimizer = AdamW(
    model.parameters(),
    lr=2e-5,   # args.learning_rate - default is 5e-5
    eps=1e-8,  # args.adam_epsilon  - default is 1e-8
)

# learning rate scheduler (no warm-up steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # Default value in run_glue.py
    num_training_steps=total_steps,
)

In [4]:
# train model (matching nodes and relations with a Transformer with subgraph sampling)
import random
import numpy as np

# use CPU to train the model
device = torch.device("cpu")
# keep the model on the same device as the input tensors (no-op on the CPU)
model.to(device)

print("%d training examples"%(len(train_dataset)))
# print("%d validation examples"%(len(valid_dataset)))

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # reset the total loss for this epoch
    total_train_loss = 0

    # put the model into training mode
    model.train()

    # for each sample of training data input as a batch of size 1
    for step, batch in enumerate(train_dataset):
        print(step)
        e_inputs = [tensor.to(device) for tensor in batch[0]]
        p_inputs = [tensor.to(device) for tensor in batch[1]]
        labels = batch[2].to(device)

        model.zero_grad()

        # forward pass
        loss, logits = model(e_inputs,
                             p_inputs,
                             labels)

        # accumulate the training loss over all of the batches
        loss_value = loss.item()
        total_train_loss += loss_value

        if not loss_value == 0:
            # backward pass
            # NOTE(review): this requires the loss returned by the model to be
            # attached to the autograd graph; the recorded run of this cell
            # failed here with "element 0 of tensors does not require grad",
            # which points at how the loss is built inside the model (not
            # visible in this notebook).
            loss.backward()
            print("Backprop")

            # clip gradient to prevent exploding
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # update parameters -- only when gradients were actually computed,
            # so that a skipped sample cannot trigger an update from stale
            # optimizer state
            optimizer.step()

        # the schedule is laid out over all samples (see `total_steps`), so it
        # advances for every sample, including skipped ones
        scheduler.step()

    # training epoch is over here

    # calculate average loss over all the batches (guarded against an empty dataset)
    avg_train_loss = total_train_loss / max(len(train_dataset), 1)
    print("Average training loss: {0:.2f}".format(avg_train_loss))


4 training examples

Training...
0
tensor([5290])
torch.Size([1, 18894])


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn