In [1]:
import random
import numpy as np

from predicates_dictionary import predicates

# bounds (inclusive) on the number of entities drawn for one subgraph
MINN = 20
MAXN = 200

N_PREDICATES = 10
N_EDGES = 1


def _draw_index_except(upper, excluded):
    """Draw indices from [0, upper) until one differs from `excluded`."""
    while True:
        candidate = np.random.randint(0, upper)
        if candidate != excluded:
            return candidate


# pool of entity labels for each hop, MAXN labels per hop
entity_labels_1 = ['Entity1%d' % idx for idx in range(MAXN)]
entity_labels_2 = ['Entity2%d' % idx for idx in range(MAXN)]

# every predicate label from the dictionary is used
p_labels = list(predicates.values())

samples = []
# one synthetic two-turn conversation per predicate
for p_idx, p_label in enumerate(p_labels):
    # size of the seed subgraph, then the entities that populate it
    n = np.random.randint(MINN, MAXN+1)
    e_labels_1 = random.sample(entity_labels_1, n)

    # the seed entity is the subject of the first question
    seed_idx = np.random.randint(0, n)
    seed_entity = e_labels_1[seed_idx]

    # (1) first turn: question about the predicate, answered by some other
    # entity of the same subgraph
    first_question = 'what is %s of %s?' % (p_label, seed_entity)
    a1_idx = _draw_index_except(n, seed_idx)
    first_answer = e_labels_1[a1_idx]

    # (2) second turn: follow-up about another non-seed entity of the subgraph
    e_idx = _draw_index_except(n, seed_idx)
    followup_entity = e_labels_1[e_idx]
    followup_question = 'what about %s?' % followup_entity

    # answer index for the follow-up, drawn from [0, MAXN); per the original
    # note it is meant to point into the other subgraph
    a2_idx = np.random.randint(0, MAXN)

    # current question first, followed by the conversation history
    q = [followup_question, first_question, first_answer]

    # each sample contains: question with history, predicate and entity list, and answer
    samples.append([q, e_labels_1, e_idx, p_labels, p_idx, seed_idx, a2_idx])

print(len(samples))

6994


In [2]:
# inspect one generated sample: its question with history, the label of its
# predicate, and the total number of predicate labels
q, e_labels_1, e_idx, p_labels, p_idx, seed_idx, a2_idx = samples[7]
print(q, p_labels[p_idx], len(p_labels), sep='\n')

['what about Entity1157?', 'what is place of birth of Entity121?', 'Entity115']
place of birth
6994


# Prepare data for training and validation

In [3]:
from transformers import AutoTokenizer


# checkpoint name; the tokenizer is loaded from it
model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# pre-processing specs
n_training_samples = 5000  # size of the training split
max_seq_len = 200  # NOTE(review): not referenced by any cell visible here

In [4]:
# shuffle the samples in place, then split them into training and validation:
# the first n_training_samples go to training, the remainder to validation
random.shuffle(samples)
train_samples, dev_samples = samples[:n_training_samples], samples[n_training_samples:]
print(len(dev_samples))

1994


In [5]:
import torch

def prepare_dataset(samples, limit=2):
    """Tokenize raw samples into model inputs.

    Args:
        samples: sequence of 7-item samples laid out as
            [question, entity labels, e_idx, predicate labels, p_idx,
            seed_idx, a2_idx].
        limit: only `samples[:limit]` are processed (default 2).

    Returns:
        A list with one [predicate_batch, entity_batch, answer_tensor] entry
        per processed sample, where answer_tensor is torch.tensor([a2_idx]).
    """
    def _pair_with(question, candidates):
        # the same question is repeated once per candidate label
        return tokenizer([question]*len(candidates), candidates, padding=True, truncation=True, return_tensors="pt")

    encoded = []
    # only the question, both label lists and the answer index are used
    for question, e_labels_1, _e_idx, p_labels, _p_idx, _seed_idx, a2_idx in samples[:limit]:
        encoded.append([
            _pair_with(question, p_labels),    # encode predicates
            _pair_with(question, e_labels_1),  # encode entities
            torch.tensor([a2_idx]),
        ])
    return encoded


# tokenize both splits; prepare_dataset's default limit applies to each
train_dataset = prepare_dataset(train_samples)
dev_dataset = prepare_dataset(dev_samples)

# report how many examples each split ended up with
for template, split in (("%d training examples", train_dataset),
                        ("%d development examples", dev_dataset)):
    print(template % len(split))

2 training examples
2 development examples


# Evaluate model before fine-tuning

In [6]:
# fix random seed
import random
import numpy as np

# one seed shared by every random number generator used in this notebook
seed_val = 42

# python's random, numpy, torch (CPU) and torch (all CUDA devices), in that order
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(seed_val)

# del model

In [8]:
from transformers import DistilBertConfig

from MPDistilBert_sampler_model import MessagePassingBert


# model init: the configuration is read from the checkpoint named by model_name
# NOTE(review): only the config is loaded on this line; whether
# MessagePassingBert loads pretrained weights on its own is not visible from
# this notebook -- confirm before treating this as a pretrained baseline.
config = DistilBertConfig.from_pretrained(model_name, num_labels=1)
model = MessagePassingBert(config)

# freeze embeddings layer
for name, param in model.bert.named_parameters():
    if name.startswith('embeddings'):
        param.requires_grad = False

# run the model on the GPU ("cuda") when one is available, otherwise on the CPU
# (the device used to be hard-coded to "cpu", contradicting the comment)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

MessagePassingBert(
  (bert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_

In [12]:
# TODO
def run_inference(model, dataset, samples):
    """Run the model over `dataset` and record top-1 hits (P@1).

    Args:
        model: module with an `eval()` method; called as
            `model(p_inputs, e_inputs, labels=...)` and expected to return a
            `(loss, logits)` pair.
        dataset: iterable of `[predicate_batch, entity_batch, answer_label]`
            entries, the layout produced by `prepare_dataset`.
        samples: raw samples matching `dataset`; not used by this function,
            kept so existing callers keep working.

    Returns:
        A list with one int per example: 1 when the argmax over the flattened
        logits equals the first element of the answer label, else 0.
    """
    # put model in evaluation mode
    model.eval()

    p1s = []  # measure accuracy of the top answer: P@1
    for batch in dataset:
        # move all inputs to the device selected at module level
        p_input_ids = batch[0].to(device)
        e_input_ids = batch[1].to(device)
        a_labels = batch[2].to(device)

        with torch.no_grad():
            # forward pass
            # NOTE(review): the recorded run of this cell failed with
            # "forward() got an unexpected keyword argument 'labels'";
            # align this call with MessagePassingBert.forward before relying on it.
            loss, logits = model(p_input_ids,
                                 e_input_ids,
                                 labels=a_labels)

        # np.argmax without an axis indexes into the flattened score array
        scores = logits.cpu().numpy()
        predicted_idx = int(np.argmax(scores))
        # fixed: the label tensor is `a_labels` (the original read the
        # undefined name `b_labels`, which raised NameError)
        true_idx = int(a_labels.cpu().numpy()[0])

        p1s.append(int(predicted_idx == true_idx))

    return p1s

# evaluate on at most the first 20 development examples
dev_dataset = dev_dataset[:20]
p1s = run_inference(model, dev_dataset, dev_samples)

# P@1 is the mean of the per-example 0/1 hits
dev_p_at_1 = np.mean(p1s)
print("Dev set P@1: %.2f" % dev_p_at_1)

TypeError: forward() got an unexpected keyword argument 'labels'