# Test Message-Passing Transformer

In [1]:
# Smoke test of the sparse tensor operations used for message passing (MP):
# scalar scaling of a sparse matrix and sparse @ dense multiplication,
# checking in both cases that the result still carries a grad_fn.
import torch

# random sparse matrix (a dense 2x3 sample converted to sparse COO layout)
A = torch.randn(2, 3).to_sparse().requires_grad_(True)
print(A)
print(A.shape)

# scalar multiplication
# NOTE: .item() returns a plain Python number, so the requires_grad=True flag
# on the temporary tensor is discarded; only A takes part in autograd here.
b = torch.randn(1, 1, requires_grad=True).item()
print(b)
y = A * b
print(y)
print(y.shape)


# dense vector multiplication: (2, 3) sparse @ (3, 1) dense -> (2, 1) dense
b = torch.ones(3, 1, requires_grad=True)
print(b)
print(b.shape)
y = torch.sparse.mm(A, b)
print(y)
print(y.shape)

tensor(indices=tensor([[0, 0, 0, 1, 1, 1],
                       [0, 1, 2, 0, 1, 2]]),
       values=tensor([-0.6693, -1.3099, -0.9233, -0.3582,  0.0946,  1.4149]),
       size=(2, 3), nnz=6, layout=torch.sparse_coo, requires_grad=True)
torch.Size([2, 3])
-0.05607571452856064
tensor(indices=tensor([[0, 0, 0, 1, 1, 1],
                       [0, 1, 2, 0, 1, 2]]),
       values=tensor([ 0.0375,  0.0735,  0.0518,  0.0201, -0.0053, -0.0793]),
       size=(2, 3), nnz=6, layout=torch.sparse_coo, grad_fn=<MulBackward0>)
torch.Size([2, 3])
tensor([[1.],
        [1.],
        [1.]], requires_grad=True)
torch.Size([3, 1])
tensor([[-2.9025],
        [ 1.1513]], grad_fn=<SparseAddmmBackward>)
torch.Size([2, 1])


In [2]:
# model init
import torch
from transformers import BertTokenizer, BertConfig

from MPBert_model import MessagePassingBert


# fix random seed for reproducibility
# (seeded before the model is constructed below, so weight init is repeatable)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


# model configuration
model_name = 'bert-base-uncased'
num_labels = 1
# size of the output layer, i.e., maximum number of entities in the subgraph
# that are candidate answers; also used as the side length of every adjacency
# matrix built with generate_adj_sp(..., num_entities)
num_entities = 12605
tokenizer = BertTokenizer.from_pretrained(model_name)
config = BertConfig.from_pretrained(model_name, num_labels=num_labels)

# MessagePassingBert is project code (MPBert_model); it is called later as
# model(input_ids, graph, ..., labels=...) and returns (loss, logits, ...).
model = MessagePassingBert(config, num_entities)
# run model on the GPU (disabled: the notebook runs on CPU)
# model.cuda()

# input data pre-process adjacency matrix
import numpy as np
import scipy.sparse as sp


def generate_adj_sp(edges, n_entities, include_inverse=True):
    '''
    Build a sparse adjacency matrix from a list of edges.

    Parameters
    ----------
    edges : sequence of (row, col) pairs
        Node index pairs; every index must be smaller than ``n_entities``.
        May be empty, in which case an all-zero matrix is returned.
    n_entities : int
        Number of nodes; the result has shape ``(n_entities, n_entities)``.
    include_inverse : bool, default True
        When True, every edge ``(i, j)`` is also added as ``(j, i)``.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix with a 1.0 entry per edge. Repeated (row, col) pairs are
        summed by the scipy constructor, so callers that need a strictly
        binary matrix must de-duplicate the edges beforehand.
    '''
    adj_shape = (n_entities, n_entities)

    edge_array = np.asarray(edges)
    # an empty edge list cannot be unpacked into (row, col) below
    if edge_array.size == 0:
        return sp.csr_matrix(adj_shape)

    # split subject (row) and object (col) node indices
    row, col = np.transpose(edge_array)

    # duplicate edges in the opposite direction
    if include_inverse:
        row, col = np.hstack([row, col]), np.hstack([col, row])

    # one unit weight per (possibly mirrored) edge
    data = np.ones(len(row))
    return sp.csr_matrix((data, (row, col)), shape=adj_shape)

def to_torch_sparse_tensor(M):
    """Convert a scipy sparse matrix into a float32 torch sparse COO tensor.

    Parameters
    ----------
    M : scipy.sparse matrix
        Any format that supports ``.tocoo()``.

    Returns
    -------
    torch.Tensor
        Sparse COO tensor with the same shape and non-zero entries as ``M``.
    """
    coo = M.tocoo().astype(np.float32)
    # torch expects int64 indices stacked as a (2, nnz) tensor
    indices = torch.from_numpy(np.vstack((coo.row, coo.col))).long()
    values = torch.from_numpy(coo.data)
    # torch.sparse.FloatTensor is the deprecated legacy constructor;
    # torch.sparse_coo_tensor is its supported replacement.
    return torch.sparse_coo_tensor(indices, values, torch.Size(coo.shape))

In [3]:
# test inference with a sample input, where input is a question and a predicate label along with the list of edges for this predicate
question1 = "Hello, my dog is cute"
# toy edge list: nodes 0 and 2..24 all point to node 1
adjacencies = [(0, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1)]
# target label passed to the model
# (presumably the index of the correct entity -- confirm against MessagePassingBert)
output = 1

# NOTE: `adjacencies` is rebound from the edge list to a scipy sparse matrix
# of shape (num_entities, num_entities), with inverse edges included by default
adjacencies = generate_adj_sp(adjacencies, num_entities)

# build input tensors
input_ids = torch.tensor(tokenizer.encode(question1)).unsqueeze(0)  # Batch size 1
graph = to_torch_sparse_tensor(adjacencies)
labels = torch.tensor([output]).unsqueeze(0)  # Batch size 1

# run inference; the first two elements of the model output are used as (loss, logits)
outputs = model(input_ids, graph, labels=labels)
loss, logits = outputs[:2]
print(loss, logits)

tensor(15.4030, grad_fn=<NllLossBackward>) tensor([[-0.2484],
        [-5.9617],
        [-0.2484],
        ...,
        [ 0.0000],
        [ 0.0000],
        [ 0.0000]], grad_fn=<SparseAddmmBackward>)


In [4]:
# forward pass in training mode on the sample input from the previous cell
# NOTE: this only computes the loss; there is no backward pass or optimizer
# step here, so no parameters are updated by this cell.
model.train()
outputs = model(input_ids, graph, labels=labels)
loss = outputs[0]
current_loss = loss.item()
print(current_loss)

14.08836555480957


# Prepare the Dataset

In [5]:
# load dataset: a JSON list of conversations (each one is later read via its
# 'questions' and 'seed_entity' keys)
import json
conversations_path = './data/train_set/train_set_ALL.json'

# JSON is UTF-8 by specification; state it explicitly so decoding does not
# depend on the platform's default locale encoding
with open(conversations_path, "r", encoding="utf-8") as conversations_file:
    conversations = json.load(conversations_file)
print("%d conversations loaded"%len(conversations))

# load graph: open the Wikidata HDT dump as the knowledge graph `kg`
from hdt import HDTDocument, TripleComponentRole
# NOTE(review): `hdt_path` used below is not defined in this notebook;
# presumably it comes from this star import -- confirm in settings.py
from settings import *

hdt_file = 'wikidata2018_09_11.hdt'
kg = HDTDocument(hdt_path+hdt_file)
namespace = 'predef-wikidata2018-09-all'
# URI prefix prepended to entity ids (e.g. Q-numbers) before looking them up in kg
PREFIX_E = 'http://www.wikidata.org/entity/'

# prepare to retrieve all adjacent nodes including literals
# (empty predicate list; presumably "1" is the number of hops and the two
# booleans control literal/direction handling -- TODO confirm against the
# HDT binding's configure_hops signature)
predicates_ids = []
kg.configure_hops(1, predicates_ids, namespace, True, False)

# load all predicate labels
# `properties` is project data (predicates.py) read as
# properties['results']['bindings'], a list of records that each carry a
# 'property' URI and a 'propertyLabel' text value
from predicates import properties

# maps the last path segment of the property URI to its human-readable label
relationid2label = {}
for p in properties['results']['bindings']:
    _id = p['property']['value'].split('/')[-1]
    label = p['propertyLabel']['value']
    relationid2label[_id] = label

# print(relationid2label)

6720 conversations loaded


In [7]:
from collections import Counter, defaultdict


def lookup_predicate_labels(predicate_ids):
    """Group predicate ids by their human-readable label.

    Each id is resolved to its URI through the knowledge graph; the last path
    segment of the URI is looked up in ``relationid2label``. When no label is
    known, the part of that segment after the last '#' is used instead.

    Returns a defaultdict(list) mapping label -> list of predicate ids, in
    the order the ids were given.
    """
    ids_by_label = defaultdict(list)
    for predicate_id in predicate_ids:
        uri = kg.global_id_to_string(predicate_id, TripleComponentRole.PREDICATE)
        local_name = uri.rsplit('/', 1)[-1]
        # fallback label: the fragment after '#', or the whole segment if there is none
        fallback = local_name.rsplit('#', 1)[-1]
        ids_by_label[relationid2label.get(local_name, fallback)].append(predicate_id)
    return ids_by_label


# tally of how many entity answers were (True) / were not (False) found in
# the retrieved subgraph; updated by check_answer_in_subgraph
answers_in_subgraph = Counter()

def check_answer_in_subgraph(conversation, subgraph):
    """Locate the answer of the first question among the subgraph entities.

    Parameters
    ----------
    conversation : dict
        Conversation record; only conversation['questions'][0]['answer'] is read.
    subgraph : list
        Global entity ids of the subgraph nodes.

    Returns
    -------
    int or None
        Position of the answer entity in ``subgraph``, or None when the
        answer is not a Wikidata entity or is not part of the subgraph.
        Position 0 is a valid result, so callers must compare against None
        rather than test truthiness.
    """
    answer1 = conversation['questions'][0]['answer']
    # consider only answers which are entities
    if 'www.wikidata.org' not in answer1:
        return None

    answer1_id = kg.string_to_global_id(PREFIX_E+answer1.split('/')[-1], TripleComponentRole.OBJECT)
    # use the argument, not a same-named global, so the function is self-contained
    in_subgraph = answer1_id in subgraph
    answers_in_subgraph.update([in_subgraph])

    # consider only answer entities that are in the subgraph
    if not in_subgraph:
        return None
    return subgraph.index(answer1_id)


max_triples = 50000000
offset = 0

# Collect only samples where the answer is an entity and it is adjacent to the
# seed entity. Every sample is a list of five tensors:
#   [input_ids, token_type_ids, attention_mask, graph (sparse), answer index]
train_dataset = []

graph_sizes = []
max_n_edges = 2409 # max size of the graph allowed in the number of edges


for conversation in conversations[:4]:
    question1 = conversation['questions'][0]['question']
    # use oracle for the correct initial entity
    seed_entity = conversation['seed_entity'].split('/')[-1]
    seed_entity_id = kg.string_to_global_id(PREFIX_E+seed_entity, TripleComponentRole.OBJECT)

    # retrieve all adjacent nodes including literals
    subgraph1 = kg.compute_hops([seed_entity_id], max_triples, offset)
    entity_ids, predicate_ids, adjacencies = subgraph1
    assert len(predicate_ids) == len(adjacencies)

    # check that the answer is in the subgraph; index 0 is a valid position,
    # so compare against None instead of relying on truthiness
    answer1_idx = check_answer_in_subgraph(conversation, entity_ids)
    if answer1_idx is None:
        continue

    # get labels for all candidate predicates
    p_labels_map = lookup_predicate_labels(predicate_ids)

    # create a sample for each predicate label separately
    for p_label, p_ids in p_labels_map.items():

        # encode a text pair of the question with a predicate label
        encoded_dict = tokenizer.encode_plus(question1, p_label,
                                             add_special_tokens=True,
                                             max_length=64,
                                             pad_to_max_length=True,
                                             return_attention_mask=True,
                                             return_token_type_ids=True)

        # merge the edges of all predicates sharing this label, keeping the
        # first occurrence of every edge (a set makes the membership test O(1))
        selected_adjacencies = []
        seen_edges = set()
        for p_id in p_ids:
            p_id_idx = predicate_ids.index(p_id)
            for edge in adjacencies[p_id_idx]:
                edge_key = tuple(edge)
                if edge_key not in seen_edges:
                    seen_edges.add(edge_key)
                    selected_adjacencies.append(edge)

        # TODO normalise graph size: pad to the maximum number of entities
        A = generate_adj_sp(selected_adjacencies, num_entities, include_inverse=True)

        # the sample is stored directly in train_dataset; the former parallel
        # lists (input_ids, attention_masks, ...) are no longer defined, and
        # `input_ids` is a tensor left over from the inference test cell
        train_dataset.append([torch.tensor([encoded_dict['input_ids']]),
                              torch.tensor([encoded_dict['token_type_ids']]),
                              torch.tensor([encoded_dict['attention_mask']]),
                              to_torch_sparse_tensor(A),
                              torch.tensor([answer1_idx])])

print(answers_in_subgraph)
print("Compiled dataset with %d samples"%len(train_dataset))

Counter({True: 1})
Compiled dataset with 40 samples


In [8]:
# training setup: optimizer and learning-rate schedule
from transformers import get_linear_schedule_with_warmup, AdamW

epochs = 4
# one optimizer/scheduler step is taken per sample (batch size 1) in the
# training loop, so the schedule length is samples * epochs
total_steps = len(train_dataset) * epochs

optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8
                 )
# learning rate scheduler (no warmup steps are configured)
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)


In [None]:
# train model
import random
import numpy as np

# set the seed value to make it reproducible
# NOTE: the model was already constructed in an earlier cell under
# torch.manual_seed(0); this reseed only affects randomness from here on.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# use CPU to train the model
# (batch tensors are moved to this device in the training loop)
device = torch.device("cpu")


for epoch_i in range(epochs):
    # switch the model to training mode at the start of every epoch
    model.train()

    # every sample is fed on its own, i.e. as a batch of size 1
    for step, batch in enumerate(train_dataset):
        # sample layout: input ids, token type ids, attention mask, graph, label
        # (b_token_mask holds the token type ids despite its name)
        b_input_ids, b_token_mask, b_input_mask, b_graphs, b_labels = [
            item.to(device) for item in batch[:5]
        ]

        # reset gradients before the forward pass
        model.zero_grad()

        # forward pass: the model returns the loss together with the logits
        loss, logits = model(
            b_input_ids,
            b_graphs,
            token_type_ids=b_token_mask,
            attention_mask=b_input_mask,
            labels=b_labels,
        )

        # backward pass, then advance the optimizer and the LR schedule
        loss.backward()
        optimizer.step()
        scheduler.step()
        # TODO monitor training progress

tensor(10.4202, grad_fn=<NllLossBackward>)
tensor(9.4415, grad_fn=<NllLossBackward>)
tensor(9.4416, grad_fn=<NllLossBackward>)
tensor(9.4423, grad_fn=<NllLossBackward>)
tensor(9.4418, grad_fn=<NllLossBackward>)
tensor(9.4800, grad_fn=<NllLossBackward>)
tensor(9.4419, grad_fn=<NllLossBackward>)
tensor(9.4418, grad_fn=<NllLossBackward>)
tensor(9.4419, grad_fn=<NllLossBackward>)
tensor(9.4418, grad_fn=<NllLossBackward>)
tensor(9.4419, grad_fn=<NllLossBackward>)
tensor(9.4418, grad_fn=<NllLossBackward>)
tensor(9.4417, grad_fn=<NllLossBackward>)
tensor(9.4419, grad_fn=<NllLossBackward>)
tensor(9.4419, grad_fn=<NllLossBackward>)
tensor(9.4419, grad_fn=<NllLossBackward>)
tensor(9.4418, grad_fn=<NllLossBackward>)
tensor(9.4419, grad_fn=<NllLossBackward>)
tensor(11.1034, grad_fn=<NllLossBackward>)
tensor(9.4419, grad_fn=<NllLossBackward>)
tensor(9.4419, grad_fn=<NllLossBackward>)
tensor(9.4418, grad_fn=<NllLossBackward>)
tensor(9.4418, grad_fn=<NllLossBackward>)
tensor(9.4418, grad_fn=<NllLossB