Example PyTorch code for what the paper's model might look like.
A quick tutorial on PyTorch can be found at https://cs230-stanford.github.io/pytorch-getting-started.html
There's also the PyTorch documentation intro at https://pytorch.org/tutorials/beginner/deep_learning_60min_blitz.html
which goes into a few more details.


In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import DataLoader

torch.manual_seed(1)

import numpy as np



In [7]:
class PathLSTM(nn.Module):
    """Modified LSTM model.

    This contains the embedding layers (entity, type, relation), PyTorch's
    default LSTM layer, and 2 linear layers.

    Args:
        e_emb_dim: size of each entity embedding vector.
        t_emb_dim: size of each type embedding vector.
        r_emb_dim: size of each relation embedding vector.
        hidden_dim: size of the LSTM hidden state and of the first linear layer.
        vocab_size: number of rows in each of the three embedding tables.
        tagset_size: number of output classes.
    """

    def __init__(self, e_emb_dim, t_emb_dim, r_emb_dim, hidden_dim, vocab_size, tagset_size):
        super(PathLSTM, self).__init__()
        self.hidden_dim = hidden_dim

        # All three tables are sized with the same vocab_size.
        self.entity_embeddings = nn.Embedding(vocab_size, e_emb_dim)
        self.type_embeddings = nn.Embedding(vocab_size, t_emb_dim)
        self.rel_embeddings = nn.Embedding(vocab_size, r_emb_dim)

        # The LSTM takes the concatenated (entity, type, relation) embeddings
        # as inputs, and outputs hidden states with dimensionality hidden_dim.
        self.lstm = nn.LSTM(e_emb_dim + t_emb_dim + r_emb_dim, hidden_dim)

        # The linear layers that map from hidden state space to tag space
        self.linear1 = nn.Linear(hidden_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, tagset_size)

    def forward(self, paths):
        """Score a batch of paths.

        Args:
            paths: LongTensor of ids indexed as (batch, step, 3); the last
                axis holds (entity id, type id, relation id) for one step.
                All paths in a batch are assumed to have the same length.

        Returns:
            Tensor indexed as (batch, tagset_size) holding log-probabilities.
        """
        # Batch is index 0 and step is index 1, so each id kind is one slice
        # of the last axis.
        entity_embed = self.entity_embeddings(paths[:, :, 0])
        type_embed = self.type_embeddings(paths[:, :, 1])
        rel_embed = self.rel_embeddings(paths[:, :, 2])
        triplet_embed = torch.cat((entity_embed, type_embed, rel_embed), 2)  # concatenates lengthwise

        # The LSTM (constructed without batch_first) expects
        # steps x batch x embedding dim. The axes must be swapped with
        # transpose: a view/reshape would only reinterpret the memory layout
        # and mix steps of different paths whenever batch size > 1.
        lstm_in = triplet_embed.transpose(0, 1).contiguous()
        lstm_out, _ = self.lstm(lstm_in)

        # Retrieve the output at the final step, since we want the state after
        # the last input item.
        tag_score = self.linear2(F.relu(self.linear1(lstm_out[-1])))

        # Paper uses relu as final activation, but nn.NLLLoss expects
        # log-probabilities, so a log-softmax is applied instead.
        return F.log_softmax(tag_score, dim=1)
      

In [8]:
# For now just construct an example by hand; later we would want to build
# these maps automatically from the vocab. Ids follow list order.
e_to_ix = {name: ix for ix, name in enumerate(
    ['Sam', 'Weijia', 'Rosa', 'Joey', 'Song1', 'Song2', 'Song3', 'Pop'])}
t_to_ix = {name: ix for ix, name in enumerate(['u', 's', 't'])}
r_to_ix = {name: ix for ix, name in enumerate(
    ['rate', 'category', 'belong', '_rate', '_category', '_belong', 'UNK'])}

# TODO: decide on actual path data format.
# Each sample is (path, label), where a path is a list of
# [entity, type, relation] steps. This could instead be stored transposed as
# [[entity1, entity2, ...], [type1, type2, ...], [rel1, rel2, ...]].
training_data = [
    (
        [
            ['Sam', 'u', 'rate'],
            ['Song1', 's', 'category'],
            ['Pop', 't', '_belong'],
            ['Song2', 's', 'UNK'],
        ],
        1,
    ),
    (
        [
            ['Sam', 'u', 'rate'],
            ['Song1', 's', '_rate'],
            ['Joey', 'u', 'rate'],
            ['Song3', 's', 'UNK'],
        ],
        0,
    ),
]

#construct tensor of item, type, and relation ids
def prepare_path(seq, e_to_ix, t_to_ix, r_to_ix):
    """Convert a path of (entity, type, relation) names to a tensor of ids.

    Relation ids are used as-is, type ids are shifted past the relation ids,
    and entity ids are shifted past both, so the three kinds never share an id.

    Args:
        seq: iterable of steps; each step is indexed as
            [entity, type, relation].
        e_to_ix: mapping from entity name to index.
        t_to_ix: mapping from type name to index.
        r_to_ix: mapping from relation name to index.

    Returns:
        A torch.long tensor with one [entity id, type id, relation id] row
        per step.
    """
    type_offset = len(r_to_ix)
    entity_offset = len(t_to_ix) + type_offset
    rows = [
        [
            entity_offset + e_to_ix[step[0]],
            type_offset + t_to_ix[step[1]],
            r_to_ix[step[2]],
        ]
        for step in seq
    ]
    return torch.tensor(rows, dtype=torch.long)

# Show one raw path and one converted path as a quick sanity check.
print(training_data[1][0])
print(prepare_path(training_data[0][0], e_to_ix, t_to_ix, r_to_ix))

# Pair every path's id tensor with its label.
formatted_data = [
    (prepare_path(path, e_to_ix, t_to_ix, r_to_ix), tag)
    for path, tag in training_data
]
print(formatted_data)

[['Sam', 'u', 'rate'], ['Song1', 's', '_rate'], ['Joey', 'u', 'rate'], ['Song3', 's', 'UNK']]
tensor([[10,  7,  0],
        [14,  8,  1],
        [17,  9,  5],
        [15,  8,  6]])
[(tensor([[10,  7,  0],
        [14,  8,  1],
        [17,  9,  5],
        [15,  8,  6]]), 1), (tensor([[10,  7,  0],
        [14,  8,  3],
        [13,  7,  0],
        [16,  8,  6]]), 0)]


In [9]:
# Toy-sized hyperparameters; the paper's values are noted next to each one.
E_EMBEDDING_DIM = 6  # 64 in paper
T_EMBEDDING_DIM = 6  # 32 in paper
R_EMBEDDING_DIM = 6  # 32 in paper
HIDDEN_DIM = 6  # this might be unit number = 256
TARGET_SIZE = 2

# Entities, types and relations are counted together into one vocabulary size.
vocab_size = sum(len(mapping) for mapping in (e_to_ix, t_to_ix, r_to_ix))
model = PathLSTM(
    e_emb_dim=E_EMBEDDING_DIM,
    t_emb_dim=T_EMBEDDING_DIM,
    r_emb_dim=R_EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=vocab_size,
    tagset_size=TARGET_SIZE,
)

# Negative log likelihood loss; it expects log-probabilities from the model.
# nn.CrossEntropyLoss() would be the alternative when the model returns
# unnormalized scores, because it applies the (log-)softmax itself.
loss_function = nn.NLLLoss()

# Paper: l2 regularization is tuned from {1e-5, 1e-4, 1e-3, 1e-2} (I think
# this is weight decay), and the learning rate is found from
# {0.001, 0.002, 0.01, 0.02} with grid search.
optimizer = optim.Adam(params=model.parameters(), lr=0.01, weight_decay=.001)

# DataLoader used for batches
train_loader = DataLoader(formatted_data, batch_size=1, shuffle=False)

# Tiny data, so 300 epochs.
for epoch in range(300):
    for path_batch, target_batch in train_loader:
        # PyTorch accumulates gradients, so clear them before each batch.
        model.zero_grad()

        # Forward pass, loss, backward pass, parameter update.
        tag_scores = model(path_batch)
        loss = loss_function(tag_scores, target_batch)
        loss.backward()
        optimizer.step()

        # print statistics
        print("loss is:", loss.item())
        

loss is: 0.7014683485031128
loss is: 0.7388348579406738
loss is: 0.5947420597076416
loss is: 0.695090115070343
loss is: 0.5296273827552795
loss is: 0.6481267213821411
loss is: 0.47548192739486694
loss is: 0.6101570129394531
loss is: 0.4242744445800781
loss is: 0.5779571533203125
loss is: 0.36931928992271423
loss is: 0.539128839969635
loss is: 0.3117123246192932
loss is: 0.5211480259895325
loss is: 0.25671860575675964
loss is: 0.5027366876602173
loss is: 0.20797215402126312
loss is: 0.481269508600235
loss is: 0.16375774145126343
loss is: 0.4568486511707306
loss is: 0.12568672001361847
loss is: 0.4299483001232147
loss is: 0.09451020509004593
loss is: 0.40103062987327576
loss is: 0.0701766312122345
loss is: 0.37055596709251404
loss is: 0.05197293311357498
loss is: 0.339013934135437
loss is: 0.038773197680711746
loss is: 0.30698487162590027
loss is: 0.029356908053159714
loss is: 0.27517032623291016
loss is: 0.022656399756669998
loss is: 0.2443258911371231
loss is: 0.017853859812021255
loss

loss is: 0.0026326067745685577
loss is: 0.0015585192013531923
loss is: 0.0026177444960922003
loss is: 0.001553758280351758
loss is: 0.002603120170533657
loss is: 0.0015491163358092308
loss is: 0.0025888520758599043
loss is: 0.0015444743912667036
loss is: 0.002574583748355508
loss is: 0.0015399513067677617
loss is: 0.002560672117397189
loss is: 0.0015355474315583706
loss is: 0.0025468789972364902
loss is: 0.0015311434399336576
loss is: 0.002533323597162962
loss is: 0.0015267394483089447
loss is: 0.002519886940717697
loss is: 0.0015224544331431389
loss is: 0.002506688004359603
loss is: 0.0015182883944362402
loss is: 0.0024937265552580357
loss is: 0.0015140033792704344
loss is: 0.0024808840826153755
loss is: 0.0015099564334377646
loss is: 0.0024682790972292423
loss is: 0.001505909371189773
loss is: 0.0024557928554713726
loss is: 0.0015018623089417815
loss is: 0.0024434253573417664
loss is: 0.0014978153631091118
loss is: 0.0024312958121299744
loss is: 0.0014940063701942563
loss is: 0.00241

In [10]:
# See what the scores are after training, on the training dataset.
# The loader built here is the one that is iterated (previously it was created
# but the loop read from train_loader instead). batch_size=1 mirrors the
# training loader, so one tensor of log-probabilities is printed per path.
test_loader = DataLoader(dataset=formatted_data, batch_size=1, shuffle=False)
with torch.no_grad():  # inference only, no gradients needed
    for path_batch, target_batch in test_loader:
        tag_scores = model(path_batch)
        print(tag_scores)

tensor([[-6.7319e+00, -1.1930e-03]])
tensor([[-1.4863e-03, -6.5122e+00]])
