## This notebook contains the code for computing BiLSTM embeddings of a sentence or document

In [3]:
import torch 
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
import torch.nn.utils.rnn as rnn_utils

import math
from nltk import word_tokenize

In [4]:
# Toy corpus used by the rest of the notebook.
sentences = [
    'I like pizza',
    'but I hate sushi',
    'I am hungry',
]

In [3]:
class Dictionary:
    """Two-way vocabulary mapping between words and integer ids.

    Ids are handed out in insertion order, starting at 0.
    """

    def __init__(self):
        self.word2index = {}
        self.index2word = {}

    def add_word(self, word):
        """Register `word` under the next free id; known words are left untouched."""
        if word in self.word2index:
            return
        new_id = len(self.word2index)
        self.word2index[word] = new_id
        self.index2word[new_id] = word

    def add_sentences(self, sentences):
        """Tokenize every sentence and register each lower-cased token."""
        for sentence in sentences:
            for token in word_tokenize(sentence):
                self.add_word(token.lower())

In [4]:
# Build the vocabulary from the toy corpus; ids follow first-occurrence order.
dictionary = Dictionary()
dictionary.add_sentences(sentences)


In [5]:
def get_indices(sentences, dictionary):
    """Convert sentences into lists of vocabulary ids.

    Each sentence is tokenized, every token is lower-cased and then looked up
    in `dictionary.word2index`. A token missing from the vocabulary raises
    KeyError.

    Returns:
        A list with one list of integer ids per input sentence.
    """
    return [
        [dictionary.word2index[token.lower()] for token in word_tokenize(sentence)]
        for sentence in sentences
    ]

def batchify(data, batch_size=2, use_cuda=False, pad_idx=0):
    """Group index sequences into padded, time-major batch tensors.

    Args:
        data: list of sentences, each a list of integer word ids.
        batch_size: maximum number of sentences per batch; the last batch may
            hold fewer. Must be at least 1.
        use_cuda: when True, every batch tensor is moved to the GPU.
        pad_idx: id written into the positions past the end of a sentence.
            Defaults to 0, the value that used to be hard-coded.

    Returns:
        A list of LongTensors, one per batch, each of shape
        (longest_sentence_in_batch, sentences_in_batch).

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))

    def list2batch(sent_list):
        # One column per sentence; shorter sentences keep pad_idx at the bottom.
        b_size = len(sent_list)
        maxlen = max(len(s) for s in sent_list)
        input_tensor = torch.LongTensor(maxlen, b_size).fill_(pad_idx)
        for idx, s in enumerate(sent_list):
            if s:
                input_tensor[:len(s), idx] = torch.LongTensor(s)
        if use_cuda:
            input_tensor = input_tensor.cuda()
        return input_tensor

    # Step through the data in strides of batch_size; empty data yields no batches.
    return [list2batch(data[start:start + batch_size])
            for start in range(0, len(data), batch_size)]

# Map the toy corpus to vocabulary ids, then group the id lists into padded
# batch tensors (default batch_size=2) and show both intermediate results.
sent_idx_data = get_indices(sentences, dictionary)
print(sent_idx_data)
batches = batchify(sent_idx_data)
print(batches)

In [19]:
class BLSTMEncoder(nn.Module):
    """Bidirectional LSTM sentence encoder with max pooling and a linear classifier.

    Pipeline: word ids -> embedding -> dropout -> BiLSTM -> max over time ->
    linear layer producing one score per judge.

    Inputs are expected time-major, i.e. shape (seq_len, batch), which is the
    layout produced by `batchify` in this notebook.

    NOTE(review): padded positions are not masked, so they take part in the
    max pooling like any real token.

    Args:
        batch_size: stored on the instance only; the forward pass reads the
            batch size from its input.
        word_emb_dim: size of the word embeddings.
        encoder_dim: hidden size of the LSTM per direction.
        vocab_size: number of rows in the embedding table.
        num_judges: number of output classes.
        num_layers: number of stacked LSTM layers.
        dropout: dropout probability applied to the embeddings and, when more
            than one LSTM layer is used, between LSTM layers.
    """

    def __init__(self, batch_size, word_emb_dim, encoder_dim, vocab_size, num_judges, num_layers=1, dropout=0.3):
        super(BLSTMEncoder, self).__init__()
        self.batch_size = batch_size
        self.word_emb_dim = word_emb_dim
        self.enc_lstm_dim = encoder_dim
        self.pool_type = 'max'
        self.dpout_model = dropout
        self.num_layers = num_layers
        self.drop = nn.Dropout(dropout)

        self.embed = nn.Embedding(vocab_size, word_emb_dim)
        # nn.LSTM only applies dropout between stacked layers; passing a
        # non-zero value with a single layer has no effect and emits a warning.
        lstm_dropout = self.dpout_model if self.num_layers > 1 else 0
        # batch_first=False because the input is (seq_len, batch); with
        # batch_first=True the recurrence would run across the sentences of a
        # batch instead of across the tokens of each sentence.
        self.enc_lstm = nn.LSTM(self.word_emb_dim, self.enc_lstm_dim, self.num_layers,
                                bidirectional=True, batch_first=False, dropout=lstm_dropout)
        self.fc = nn.Linear(self.enc_lstm_dim * 2, num_judges)  # 2 for bidirection

        # init_embedding() is intentionally not called here, so the PyTorch
        # default initialisation is kept unless the caller invokes it.

    def forward(self, x, evaluation_mode=False):
        """Score a batch of sentences.

        Args:
            x: LongTensor of word ids with shape (seq_len, batch).
            evaluation_mode: when True, the forward pass runs without gradient
                tracking. Dropout is still governed by train()/eval().

        Returns:
            Tensor of shape (batch, num_judges) with unnormalised class scores.
        """
        # Never re-enable gradients if the caller already disabled them.
        track_grad = torch.is_grad_enabled() and not evaluation_mode
        with torch.set_grad_enabled(track_grad):
            emb = self.drop(self.embed(x))  # (seq_len, batch, word_emb_dim)

            # Initial hidden and cell states default to zeros when omitted.
            out, _ = self.enc_lstm(emb)  # (seq_len, batch, 2 * enc_lstm_dim)

            # max pooling over the time dimension
            out = torch.max(out, 0)[0]

            # classification of judge
            return self.fc(out)

    def init_embedding(self):
        """Re-initialise the embedding and LSTM parameters.

        Embedding and LSTM weights are drawn uniformly from [-0.1, 0.1) and
        LSTM biases are set to zero. nn.LSTM exposes its parameters per layer
        and direction (weight_ih_l0, bias_hh_l0_reverse, ...), so they are
        visited by name rather than through a single weight/bias attribute.
        """
        initrange = 0.1
        self.embed.weight.data.uniform_(-initrange, initrange)
        for name, param in self.enc_lstm.named_parameters():
            if 'bias' in name:
                param.data.fill_(0)
            else:
                param.data.uniform_(-initrange, initrange)


In [20]:
# initialize the BiLSTM model
# Batches are time-major, (seq_len, batch), so the number of sentences is
# dimension 1; len() on such a tensor returns the sequence length instead.
BATCH_SIZE = batches[-1].size(1)
WORD_EMB_DIM = 5
ENCODER_DIM = 10
VOCAB_SIZE = len(dictionary.word2index)
JUDGE_NUM = 100
model = BLSTMEncoder(BATCH_SIZE, WORD_EMB_DIM, ENCODER_DIM, VOCAB_SIZE, JUDGE_NUM)

## checking if the model is producing the embedding
# sent_output = model(batches[-1])

In [21]:
# Hyper-parameters of the demo training run.
learning_rate = 0.01
num_epochs = 5

criterion = nn.CrossEntropyLoss()
# The network built above is bound to `model`.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# NOTE(review): the notebook defines no target labels. Random judge ids, one
# per sentence, stand in so that the loop runs end to end.
# TODO: replace with the real judge label of every sentence.
batch_labels = [torch.randint(0, JUDGE_NUM, (batch.size(1),), device=batch.device)
                for batch in batches]

# Train the Model
for epoch in range(num_epochs):
    for i, (batch, labels) in enumerate(zip(batches, batch_labels)):
        # Forward + Backward + Optimize
        optimizer.zero_grad()
        outputs = model(batch)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if (i+1) % 100 == 0:
            # loss is a 0-dim tensor; .item() extracts the Python float.
            print ('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'
                   %(epoch+1, num_epochs, i+1, len(batches), loss.item()))


Variable containing:

Columns 0 to 9 
-0.0075  0.1641  0.0566  0.2765  0.1459 -0.0716  0.1065  0.1346  0.0459 -0.0001

Columns 10 to 19 
 0.0680  0.0038  0.3045  0.1725  0.0591  0.1864  0.1845  0.0679  0.2022  0.1409
[torch.FloatTensor of size 1x20]