# Generate synthetic dataset

In [33]:
import random
import numpy as np

from predicates_dictionary import predicates

# number of entities in the subgraph (inclusive bounds)
MINN = 20
MAXN = 200

# number of predicate labels sampled per question; the target predicate is
# appended on top of them, so a question has up to N_PREDICATES + 1 labels
N_PREDICATES = 10
# every predicate receives N_EDGES - 1 random edges; the target predicate
# additionally receives the (seed, answer) edge
N_EDGES = 1

# seed both RNGs used below so that the generated dataset is reproducible
DATA_SEED = 42
random.seed(DATA_SEED)
np.random.seed(DATA_SEED)

all_p_labels = list(predicates.values())

samples = []
# generate questions using predicate labels as seeds
for p_label in predicates.values():
    q = 'who is %s?' % p_label

    # size of the random subgraph
    n = np.random.randint(MINN, MAXN+1)

    # pick a seed at random
    seed = np.random.randint(0, n)

    # pick an answer at random which is not a seed
    answer = seed
    while answer == seed:
        answer = np.random.randint(0, n)

    # sample candidate predicates and make sure the target one is among them
    p_labels = random.sample(all_p_labels, min(N_PREDICATES, len(all_p_labels)))
    p_labels.append(p_label)
    # de-duplicate while keeping a deterministic order: list(set(...)) orders
    # strings by their hash, which changes between interpreter runs
    p_labels = list(dict.fromkeys(p_labels))
    # shuffle so that the target predicate does not always sit at the end
    random.shuffle(p_labels)
    p_idx = p_labels.index(p_label)

    # one edge list per predicate, all over the same n entities
    adjacencies = []
    for p in p_labels:
        # sample edges
        edges = [(np.random.randint(0, n), np.random.randint(0, n))
                 for _ in range(N_EDGES - 1)]

        if p == p_label:
            # make sure there is an edge between the answer and the seed for the correct predicate
            edges.append((seed, answer))

        # drop duplicate edges, keeping first-seen order
        adjacencies.append(list(dict.fromkeys(edges)))

    assert len(adjacencies) == len(p_labels)

    samples.append([q, p_labels, adjacencies, seed, answer, n, p_label, p_idx])

print(len(samples))

6994


In [34]:
# unpack one generated example and print a short summary of it
q, p_labels, adjacencies, seed, answer, n_entities, p_label, p_idx = samples[27]

# printed in order: the question, the target predicate label, the number of
# edges stored for the first predicate, the number of edge lists and the
# number of predicate labels
for summary_item in (q, p_label, len(adjacencies[0]), len(adjacencies), len(p_labels)):
    print(summary_item)

who is member of sports team?
member of sports team
0
11
11


# Prepare data for training and validation

In [35]:
from transformers import DistilBertTokenizer

# checkpoint name used to load the tokenizer
model_name = 'distilbert-base-uncased'

# tokenizer init
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# pre-processing specs
n_training_samples = 5000  # size of the training split; the remainder is used for validation
max_seq_len = 200          # passed to the tokenizer as max_length

In [36]:
# shuffle and split dataset into training and validation
if not 0 < n_training_samples < len(samples):
    raise ValueError("n_training_samples=%d must be between 1 and %d to leave "
                     "a non-empty validation split"
                     % (n_training_samples, len(samples) - 1))

# random.sample returns a shuffled copy, so `samples` keeps its generation
# order and samples[i] keeps referring to the same example after this cell ran
shuffled_samples = random.sample(samples, len(samples))
train_samples = shuffled_samples[:n_training_samples]
dev_samples = shuffled_samples[n_training_samples:]
print(len(dev_samples))

1994


In [37]:
import torch

from utils import adj


def prepare_dataset(samples):
    """Convert generated samples into model inputs.

    Each sample is an 8-item sequence: question, predicate labels, per-predicate
    edge lists, seed index, answer index, number of entities, target predicate
    label and target predicate index (the last two are not used here).

    Returns a list with one entry per sample:
    [input_ids tensor, attention_mask tensor,
     [indices, relation_mask, entities], answer tensor],
    where input_ids / attention_mask hold one row per predicate label.
    """
    prepared = []
    for (question, candidate_labels, edge_lists, seed_idx,
         answer_idx, n_entities, _target_label, _target_idx) in samples:

        # encode the question paired with every predicate label; the labels
        # of one sample form one batch
        encodings = [
            tokenizer.encode_plus(question, candidate_label,
                                  add_special_tokens=True,
                                  max_length=max_seq_len,
                                  pad_to_max_length=True,
                                  return_attention_mask=True)
            for candidate_label in candidate_labels
        ]
        input_ids = [encoding['input_ids'] for encoding in encodings]
        attention_masks = [encoding['attention_mask'] for encoding in encodings]

        # graph representation built by the project helper `adj`
        # (presumably a sparse adjacency structure — see utils.adj)
        indices, relation_mask = adj(edge_lists, n_entities, len(edge_lists))

        # column vector over entities with only the seed entity set to 1
        entities = torch.zeros(n_entities, 1)
        entities[[seed_idx]] = 1

        prepared.append([torch.tensor(input_ids),
                         torch.tensor(attention_masks),
                         [indices, relation_mask, entities],
                         torch.tensor([answer_idx])])
    return prepared


# run the same pre-processing over the training and the validation split
train_dataset, dev_dataset = [prepare_dataset(split)
                              for split in (train_samples, dev_samples)]

print("%d training examples" % len(train_dataset))
print("%d development examples" % len(dev_dataset))

5000 training examples
1994 development examples


# Evaluate model before fine-tuning

In [43]:
# fix random seed
import random
import numpy as np

seed_val = 42

# seed every RNG used in the notebook: Python, NumPy, PyTorch (CPU and CUDA)
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# drop a model left over from an earlier run of the cells below; on a fresh
# kernel `model` does not exist yet and a bare `del model` raises NameError
globals().pop('model', None)

In [44]:
from transformers import DistilBertConfig

from MPDistilBert_model import MessagePassingBert


# model init
# NOTE(review): only the configuration is loaded from `model_name` here;
# whether MessagePassingBert(config) also loads the pre-trained weights cannot
# be seen from this notebook — confirm.
config = DistilBertConfig.from_pretrained(model_name, num_labels=1)
model = MessagePassingBert(config)

# freeze embeddings layer
for name, param in model.bert.named_parameters():
    if name.startswith('embeddings'):
        param.requires_grad = False

# To additionally freeze the first k-1 Transformer blocks, iterate over
# model.bert.transformer.layer.children() and set requires_grad = False on
# the parameters of each of those blocks.

# run model on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

MessagePassingBert(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_

In [45]:
def run_inference(model, dataset, samples, device=None):
    """Score every example of `dataset` and return the per-example P@1 hits.

    Args:
        model: callable as model(input_ids, graphs, attention_mask=..., labels=...)
            returning a (loss, logits) pair.
        dataset: examples as produced by prepare_dataset, i.e.
            [input_ids, attention_mask, [graph tensors...], answer tensor].
        samples: the raw samples aligned with `dataset`; samples[i][4] is the
            answer index of dataset[i] and is used as a sanity check.
        device: device to move the inputs to. Defaults to the device of the
            model's first parameter, so the function does not depend on a
            notebook-global `device` variable.

    Returns:
        A list with one int per example: 1 if the argmax over the flattened
        logits equals the answer index, else 0.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    # put model in evaluation mode
    model.eval()

    p1s = []  # measure accuracy of the top answer: P@1
    for i, batch in enumerate(dataset):
        answer_idx = samples[i][4]

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_graphs = [tensor.to(device) for tensor in batch[2]]
        b_labels = batch[3].to(device)

        with torch.no_grad():
            # forward pass; the loss is not needed for P@1
            _, logits = model(b_input_ids,
                              b_graphs,
                              attention_mask=b_input_mask,
                              labels=b_labels)

        # copy the scores to the host once and take the best-scoring index
        scores = logits.cpu().numpy()
        predicted_idx = int(np.argmax(scores))
        true_idx = int(b_labels.cpu().numpy()[0])
        # dataset and samples must describe the same example
        assert true_idx == answer_idx

        p1s.append(int(predicted_idx == true_idx))

    return p1s

# keep only the first 20 development examples (dev_dataset itself is
# overwritten) and report the share of correct top-1 answers on them
dev_dataset = dev_dataset[:20]
p1s = run_inference(model, dataset=dev_dataset, samples=dev_samples)
dev_p1 = np.mean(p1s)
print("Dev set P@1: %.2f" % dev_p1)

Dev set P@1: 0.00


# Fine-tune model on training split

In [46]:
# training specs
from transformers import get_linear_schedule_with_warmup, AdamW

epochs = 2

# one optimisation step per training example (batch of size 1).
# NOTE: this is computed from train_dataset as it is right now; if the
# training cell trims train_dataset afterwards, the linear decay below spans
# more steps than are actually taken unless the scheduler is rebuilt there.
total_steps = len(train_dataset) * epochs

optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8
                 )
# learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

# single device definition: GPU when available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

MessagePassingBert(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_

In [47]:
import gc

# reduce data size
dev_dataset = dev_dataset[:]
train_dataset = train_dataset[:100]

# The linear schedule has to span the optimisation steps that are actually
# taken. It is rebuilt here, after trimming the training set, so that the
# learning rate decays to zero over len(train_dataset) * epochs steps and so
# that re-running this cell starts from a fresh schedule.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = len(train_dataset) * epochs)

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # reset the total loss for this epoch
    total_train_loss = 0

    # put the model into training mode
    model.train()

    # for each sample of training data input as a batch of size 1
    for step, batch in enumerate(train_dataset):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_graphs = [tensor.to(device) for tensor in batch[2]]
        b_labels = batch[3].to(device)
        model.zero_grad()

        # forward pass
        loss, logits = model(b_input_ids,
                             b_graphs,
                             attention_mask=b_input_mask,
                             labels=b_labels)

        # only the loss is needed from here on; drop the other references
        # before the backward pass
        del b_input_ids, b_graphs, b_input_mask, b_labels, logits

        # accumulate the training loss over all of the batches
        total_train_loss += loss.item()

        # clean up
        gc.collect()
        torch.cuda.empty_cache()

        # backward pass
        loss.backward()

        # clip gradient to prevent exploding
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update parameters
        optimizer.step()
        scheduler.step()

        # clean up
        gc.collect()
        torch.cuda.empty_cache()

    # training epoch is over here

    # calculate average loss over all the batches (guard against an empty set)
    avg_train_loss = total_train_loss / max(len(train_dataset), 1)
    print("  Average training loss: {0:.2f}".format(avg_train_loss))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    # put the model in evaluation mode
    model.eval()

    total_eval_loss = 0

    # evaluate data for one epoch
    for step, batch in enumerate(dev_dataset):

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_graphs = [tensor.to(device) for tensor in batch[2]]
        b_labels = batch[3].to(device)

        with torch.no_grad():
            # forward pass; only the loss is used for validation
            loss, _ = model(b_input_ids,
                            b_graphs,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            # accumulate validation loss
            total_eval_loss += loss.item()

    # average validation loss (guard against an empty set)
    avg_val_loss = total_eval_loss / max(len(dev_dataset), 1)
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))


Training...
  Average training loss: 0.13

Running Validation...
  Validation Loss: 0.00

Training...
  Average training loss: 0.00

Running Validation...
  Validation Loss: 0.00


# Evaluate model after fine-tuning

In [48]:
# score the first 20 development examples again, now with the fine-tuned
# model, and report the share of correct top-1 answers
dev_dataset = dev_dataset[:20]
p1s = run_inference(model, dataset=dev_dataset, samples=dev_samples)
dev_p1 = np.mean(p1s)
print("Dev set P@1: %.2f" % dev_p1)

Dev set P@1: 1.00
