In [1]:
import numpy as np
import glob
import os
import sys
import time
import datetime
import io

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.nn.utils import clip_grad_norm_

import zipfile
import tarfile
import gzip
import logging
from collections import Counter

from sklearn import metrics

In [2]:
# Index of the GPU this notebook should use; reset to -1 below when CUDA is absent.
gpu_id = 0

os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": str(gpu_id),
})

# Pick the tensor constructors used by the rest of the notebook.
if not torch.cuda.is_available():
    print('cuda not available')
    gpu_id = -1
    dtypeFloat, dtypeLong = torch.FloatTensor, torch.LongTensor
else:
    print('cuda available')
    dtypeFloat, dtypeLong = torch.cuda.FloatTensor, torch.cuda.LongTensor
   
    

cuda available


In [3]:
def load_data(in_file, max_example=None, relabeling=True):
    """
        Load (document, question, answer) examples from `in_file`.

        The file is read as records of four consecutive lines: question,
        answer, document, and one trailing line that is discarded.
        Questions and documents are lower-cased; the answer is only stripped.

        in_file: path of the UTF-8 text file to read.
        max_example: stop after this many examples (None reads the whole file).
        relabeling: when True, every token starting with '@entity' is renamed
            to '@entity0', '@entity1', ... in order of first appearance
            (document tokens first, then question tokens), and the answer is
            mapped through the same renaming.

        Returns a tuple (documents, questions, answers) of equal-length lists.
        Raises AssertionError if relabeling is on and the answer is not a
        token of its document.
    """
    documents = []
    questions = []
    answers = []

    # `with` closes the handle even when the assertion below fires mid-file.
    with open(in_file, 'r', encoding='utf-8') as f:
        while True:
            line = f.readline()
            if not line:
                break
            question = line.strip().lower()
            answer = f.readline().strip()
            document = f.readline().strip().lower()

            if relabeling:
                q_words = question.split(' ')
                d_words = document.split(' ')
                assert answer in d_words, 'answer %r not found in its document' % answer

                # Per-example renaming table: original marker -> dense marker.
                entity_dict = {}
                entity_id = 0
                for word in d_words + q_words:
                    if (word.startswith('@entity')) and (word not in entity_dict):
                        entity_dict[word] = '@entity' + str(entity_id)
                        entity_id += 1

                q_words = [entity_dict.get(w, w) for w in q_words]
                d_words = [entity_dict.get(w, w) for w in d_words]
                answer = entity_dict[answer]

                question = ' '.join(q_words)
                document = ' '.join(d_words)

            questions.append(question)
            answers.append(answer)
            documents.append(document)

            # Skip the line that separates two records.
            f.readline()
            if (max_example is not None) and (len(documents) >= max_example):
                break

    print('#Examples: %d' % len(documents))
    return (documents, questions, answers)

In [4]:
def build_dict(sentences, max_words=120000):
    """
        Map the most frequent words of `sentences` to integer ids.

        Sentences are split on single spaces. The `max_words` most common
        words receive ids 2, 3, ... in decreasing order of frequency.
        Id 0 is assigned to '<UNK>' and id 1 to '<PAD>'.
        The five most and five least frequent kept words are printed.
    """
    word_count = Counter(w for sent in sentences for w in sent.split(' '))

    ls = word_count.most_common(max_words)
    print('#Words: %d -> %d' % (len(word_count), len(ls)))
    for head_entry in ls[:5]:
        print(head_entry)
    print('...')
    for tail_entry in ls[-5:]:
        print(tail_entry)

    # Ids 0 and 1 are reserved for the special tokens added below.
    vocab_dict = {}
    for word_id, (word, _) in enumerate(ls, start=2):
        vocab_dict[word] = word_id
    vocab_dict['<UNK>'] = 0
    vocab_dict['<PAD>'] = 1

    return vocab_dict

In [5]:
def vectorize(examples, word_dict, entity_dict):
    """
        Turn textual examples into id sequences and labels.

        examples: (documents, questions, answers), three parallel lists.
        word_dict: word -> id; words missing from it are encoded as 0.
        entity_dict: entity marker -> label id; a missing answer becomes 0.

        Returns (in_x1, in_x2, in_l, in_y, max_d, max_q):
        in_x1, in_x2: id sequences for documents and questions.
        in_l: array of shape (number of examples, len(entity_dict)) holding
              1.0 where the entity occurs in the document.
        in_y: label id of each answer.
        max_d, max_q: longest document / question length seen.
    """
    documents, questions, answers = examples[0], examples[1], examples[2]
    total = len(documents)

    in_x1, in_x2, in_y = [], [], []
    in_l = np.zeros((total, len(entity_dict)))
    max_d = max_q = 0

    for idx, (d, q, a) in enumerate(zip(documents, questions, answers)):
        d_words = d.split(' ')
        q_words = q.split(' ')
        assert (a in d_words)
        seq1 = [word_dict.get(w, 0) for w in d_words]
        seq2 = [word_dict.get(w, 0) for w in q_words]

        max_d = max(max_d, len(seq1))
        max_q = max(max_q, len(seq2))

        if seq1 and seq2:
            in_x1.append(seq1)
            in_x2.append(seq2)
            present = [entity_dict[w] for w in d_words if w in entity_dict]
            in_l[idx, present] = 1.0
            in_y.append(entity_dict.get(a, 0))
        if idx % 10000 == 0:
            print('Vectorization: processed %d / %d' % (idx, total))

    return in_x1, in_x2, in_l, in_y, max_d, max_q

In [6]:
def get_minibatches(n, minibatch_size, shuffle=False):
    """
        Split the indices 0..n-1 into consecutive chunks of `minibatch_size`.

        The last chunk may be shorter. With `shuffle=True` the order of the
        chunks (not the indices inside a chunk) is randomised with NumPy's
        global RNG. Returns a list of index arrays.
    """
    starts = np.arange(0, n, minibatch_size)
    if shuffle:
        np.random.shuffle(starts)
    return [np.arange(first, min(first + minibatch_size, n)) for first in starts]

In [7]:
def prepare_data(seqs, max_l):
    """
        Right-pad `seqs` with zeros into one int32 matrix.

        seqs: list of id sequences.
        max_l: requested number of columns. If a sequence is longer than
            `max_l` (e.g. a dev/test document longer than every training
            document) the matrix is widened to fit it instead of failing
            with a broadcast error.

        Returns (x, lengths): x has shape (len(seqs), max(max_l, longest
        sequence)) and lengths is an array with the true length of each row.
    """
    lengths = [len(seq) for seq in seqs]
    n_samples = len(seqs)

    # Never allocate fewer columns than the longest sequence needs.
    width = max([max_l] + lengths)
    x = np.zeros((n_samples, width)).astype('int32')

    for idx, seq in enumerate(seqs):
        x[idx, :lengths[idx]] = seq

    return x, np.array(lengths)


In [8]:
def gen_examples(x1, x2, l, y, batch_size, max_d, max_q):
    """
        Group examples into padded mini-batches of at most `batch_size`.

        x1, x2: document and question id sequences; y: labels.
        l: accepted for interface compatibility but not used here.
        max_d, max_q: padding widths handed to `prepare_data`.

        Returns a list of tuples
        (doc_matrix, doc_lengths, ques_matrix, ques_lengths, labels).
    """
    all_ex = []
    for batch_idx in get_minibatches(len(x1), batch_size):
        labels = np.array([y[i] for i in batch_idx])
        docs, doc_lens = prepare_data([x1[i] for i in batch_idx], max_d)
        ques, ques_lens = prepare_data([x2[i] for i in batch_idx], max_q)
        all_ex.append((docs, doc_lens, ques, ques_lens, labels))
    return all_ex

In [None]:
def gen_embeddings_glove(word_dict, dim, in_file=None):
    """
        Generate an initial embedding matrix for `word_dict` from glove pretrained embeddings.
        If an embedding file is not given or a word is not in the embedding file,
        a randomly initialized vector will be used.

        word_dict: word -> row index.
        dim: embedding dimension; every line of the file must hold a word
            followed by exactly `dim` numbers (AssertionError otherwise).
        in_file: path of the GloVe text file, or None to skip loading. When a
            path is given but the file is missing, glove.6B.zip is downloaded
            with `wget` into ./data/glove/ and extracted there.

        Returns an array of shape (len(word_dict) + 2, dim).
    """
    # Only probe the path when one was given: os.path.isfile(None) raises TypeError.
    if in_file is not None and not os.path.isfile(in_file):
        print('Glove pretrained embedding missing.. Downloading...')
        if not os.path.exists('./data/glove/'):
            os.makedirs('./data/glove/')
        os.system('wget http://nlp.stanford.edu/data/glove.6B.zip -P ./data/glove/')
        with zipfile.ZipFile('./data/glove/glove.6B.zip', 'r') as zip_ref:
            zip_ref.extractall('./data/glove/')

    # NOTE(review): a word_dict from build_dict already contains '<UNK>' and
    # '<PAD>', so the extra two rows look unused; kept so the matrix shape
    # (and anything sized from it) does not change.
    num_words = len(word_dict) + 2
    embeddings = np.random.uniform(size=(num_words, dim))
    print('Embeddings: %d x %d' % (num_words, dim))

    if in_file is not None:
        print('Loading embedding file: %s' % in_file)
        pre_trained = 0
        # Stream the file line by line instead of loading it whole, and
        # make sure the handle is closed afterwards.
        with open(in_file, encoding='utf-8') as f:
            for line in f:
                sp = line.split()
                assert len(sp) == dim + 1
                if sp[0] in word_dict:
                    pre_trained += 1
                    embeddings[word_dict[sp[0]]] = [float(x) for x in sp[1:]]
        print('Pre-trained: %d (%.2f%%)' %
              (pre_trained, pre_trained * 100.0 / num_words))
    return embeddings

def gen_embeddings_fasttext(word_dict, dim, in_file=None):
    """
        Generate an initial embedding matrix for `word_dict` from fasttext pretrained embeddings.
        If an embedding file is not given or a word is not in the embedding file,
        a randomly initialized vector will be used.

        word_dict: word -> row index.
        dim: embedding dimension.
        in_file: path of the fastText `.vec` file, or None to skip loading.
            The first line of the file must contain exactly two fields
            (AssertionError otherwise); each following line is a word and
            its vector. When a path is given but the file is missing,
            wiki.en.vec is downloaded with `wget` into ./data/fasttext/.

        Returns an array of shape (len(word_dict) + 2, dim).
    """
    # Only probe the path when one was given: os.path.isfile(None) raises TypeError.
    if in_file is not None and not os.path.isfile(in_file):
        print('Fasttext pretrained embedding missing.. Downloading...')
        if not os.path.exists('./data/fasttext/'):
            os.makedirs('./data/fasttext/')
        os.system('wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.vec -P ./data/fasttext/')

    # NOTE(review): a word_dict from build_dict already contains '<UNK>' and
    # '<PAD>', so the extra two rows look unused; kept so the matrix shape
    # does not change.
    num_words = len(word_dict) + 2
    embeddings = np.random.uniform(size=(num_words, dim))
    print('Embeddings: %d x %d' % (num_words, dim))

    if in_file is not None:
        print('Loading fasttext embedding file: %s' % in_file)
        pre_trained = 0
        with io.open(in_file, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
            for i, line in enumerate(f):
                if i == 0:
                    # Header line: two fields, not an embedding.
                    split = line.split()
                    assert len(split) == 2
                else:
                    word, vect = line.rstrip().split(' ', 1)
                    if word in word_dict:
                        pre_trained += 1
                        vect = np.fromstring(vect, sep=' ')
                        embeddings[word_dict[word]] = vect
        print('Pre-trained: %d (%.2f%%)' % (pre_trained, pre_trained * 100.0 / num_words))
    return embeddings

def gen_embeddings_random(word_dict, dim):
    """
        Build a uniformly random initial embedding matrix for `word_dict`.

        Returns an array of shape (len(word_dict) + 2, dim) drawn from
        NumPy's global RNG, and prints that shape.
    """
    shape = (len(word_dict) + 2, dim)
    matrix = np.random.uniform(size=shape)
    print('Embeddings: %d x %d' % shape)
    return matrix

def gen_embeddings(word_dict, dim, in_file=None, init_from='random'):
    """
        Build the initial embedding matrix for `word_dict`.

        init_from selects the source: 'glove' or 'fasttext' load pretrained
        vectors from `in_file`; any other value gives a random matrix and
        ignores `in_file`.
    """
    if init_from == 'glove':
        loader = gen_embeddings_glove
    elif init_from == 'fasttext':
        loader = gen_embeddings_fasttext
    else:
        return gen_embeddings_random(word_dict, dim)
    return loader(word_dict, dim, in_file)

### Basic LSTM Model 

Since the data must be processed sequentially, we chose a bi-directional LSTM as a baseline, as this architecture is well established for sequence modelling.

In this model the tokens of each sentence are embedded. The embedded tokens of the document related to the question are passed through a bi-directional LSTM, followed by those of the question. The final outputs for the last token of the document sequence and of the question sequence are then summed to produce the final output answer. The two quantities are summed in order to connect the document and the question for the prediction of the final answer.

In [11]:
class Basic_LSTM_model(nn.Module):
    """
    Baseline reader: one bi-directional LSTM shared by document and question.

    Both inputs are embedded with the same embedding table and encoded by the
    same LSTM. The last-layer forward and backward final hidden states are
    concatenated per input, the document and question vectors are summed,
    and a linear layer maps the sum to `output_size` logits.
    """

    def __init__(self, embeddings, hidden_size, output_size, num_layers=1, drp_rate=0.25):
        """
        embeddings: initial embedding matrix of shape (vocab_size, embedding_size);
            it is fine-tuned during training (freeze=False).
        hidden_size: LSTM hidden size per direction.
        output_size: number of output classes.
        num_layers: number of stacked LSTM layers.
        drp_rate: dropout between LSTM layers.
        """
        super(Basic_LSTM_model, self).__init__()

        self.vocab_size = embeddings.shape[0]
        self.embedding_size = embeddings.shape[1]
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.drp = drp_rate

        weight = torch.Tensor(embeddings)
        self.embeddings = nn.Embedding.from_pretrained(weight, freeze=False)

        # Use the size taken from `embeddings`, not a notebook-level global.
        # Inter-layer dropout has nothing to act on with a single layer, so
        # it is only passed for stacked LSTMs (avoids a PyTorch warning).
        self.context_lstm = nn.LSTM(self.embedding_size, hidden_size, num_layers,
                                    batch_first=True, bidirectional=True,
                                    dropout=self.drp if num_layers > 1 else 0)

        self.linear = nn.Linear(hidden_size*2, output_size)

    def _contextual_embedding(self, data, seq_lengths):
        """Run the shared LSTM over a padded batch and return (output, hidden, c) in input order."""
        lengths = np.asarray(seq_lengths)

        # pack_padded_sequence expects the batch sorted by decreasing length.
        idx_sort = np.argsort(-lengths)
        idx_original = np.argsort(idx_sort)
        # Plain list rather than a reversed NumPy view, which has negative
        # strides that torch cannot wrap as a tensor.
        sorted_lengths = lengths[idx_sort].tolist()

        # Index tensors live on the same device as the data they index.
        idx_sort = torch.from_numpy(idx_sort).long().to(data.device)
        idx_original = torch.from_numpy(idx_original).long().to(data.device)

        data = data.index_select(0, idx_sort)
        packed_input = pack_padded_sequence(data, sorted_lengths, batch_first=True)
        packed_output, (hidden, c) = self.context_lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        # Restore the caller's batch order.
        output = output.index_select(0, idx_original)
        hidden = hidden.index_select(1, idx_original)
        c = c.index_select(1, idx_original)

        return output, hidden, c

    def forward(self, doc_x, ques_x, doc_seq_lengths, ques_seq_lengths):
        '''
        doc_x: torch tensor. size - (batch_size, doc_seq_len)
        ques_x: torch tensor. size - (batch_size, ques_seq_len)
        doc_seq_lengths: 1d numpy array containing lengths of each document in doc_x
        ques_seq_lengths: 1d numpy array containing lengths of each question in ques_x

        Returns logits of size (batch_size, output_size).
        '''
        doc_data = self.embeddings(doc_x)    # (batch_size, doc_seq_len, embedding_dim)
        ques_data = self.embeddings(ques_x)  # (batch_size, ques_seq_len, embedding_dim)

        # hidden shape: (num_layers * num_directions, batch_size, hidden_size)
        _, doc_hidden, _ = self._contextual_embedding(doc_data, doc_seq_lengths)
        _, ques_hidden, _ = self._contextual_embedding(ques_data, ques_seq_lengths)

        # The last two entries are the top layer's forward and backward
        # states. Indexing them directly keeps the batch axis for a batch of
        # one and yields (batch_size, 2 * hidden_size) for any num_layers.
        docs_hidden = torch.cat([doc_hidden[-2], doc_hidden[-1]], dim=1)
        ques_hidden = torch.cat([ques_hidden[-2], ques_hidden[-1]], dim=1)

        final_op = docs_hidden + ques_hidden  # (batch_size, 2 * hidden_size)

        logits = self.linear(final_op)  # (batch_size, output_size)

        return logits

    def loss(self, y, y_target):
        """Cross-entropy between logits `y` and class indices `y_target`."""
        loss = nn.CrossEntropyLoss()(y,y_target)

        return loss

    def update(self, lr):
        """Create a plain SGD optimizer over all model parameters with learning rate `lr`."""
        update = torch.optim.SGD( self.parameters(), lr=lr )

        return update

    def update_learning_rate(self, optimizer, lr):
        """Set `lr` on every parameter group of `optimizer` and return the optimizer."""
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        return optimizer

In [12]:
def train_one_epoch(net,optimizer,tr_data):
    """
    Train `net` for one pass over `tr_data`.

    net: model exposing `net(x1, x2, x1_len, x2_len)` -> logits and
        `net.loss(logits, target)`; it must own at least one parameter,
        since the batch tensors are placed on that parameter's device.
    optimizer: optimizer stepping the parameters of `net`.
    tr_data: list of (mb_x1, x1_len, mb_x2, x2_len, mb_y) batches.
        The list is shuffled IN PLACE at the start of the epoch.

    Returns (mean loss, accuracy), both weighted by batch size.
    """
    net.train()

    # Put batches where the model lives instead of relying on a global dtype.
    device = next(net.parameters()).device

    def to_long(array):
        return torch.from_numpy(np.asarray(array)).long().to(device)

    run_loss = 0
    run_nb_data = 0
    run_acc = 0

    np.random.shuffle(tr_data)

    for mb_x1, x1_len, mb_x2, x2_len, mb_y in tr_data:
        nb_data = float(mb_x1.shape[0])

        x1 = to_long(mb_x1)
        x2 = to_long(mb_x2)
        target = to_long(mb_y)

        optimizer.zero_grad()

        # Call the module itself (not .forward) so registered hooks run.
        pred = net(x1,x2,x1_len,x2_len)

        loss = net.loss(pred,target)
        loss.backward()
        optimizer.step()

        run_loss += (loss.detach().item())*nb_data
        run_nb_data += nb_data

        _, batch_predicted_labels = torch.max(pred.detach(), dim=1)

        # Compare against the NumPy labels: a CUDA `target` cannot be turned
        # into an array without first copying it to the host.
        run_acc += float((batch_predicted_labels.cpu().numpy() == np.asarray(mb_y)).sum())

    acc_tr = run_acc/run_nb_data
    loss_tr = run_loss/run_nb_data

    return loss_tr, acc_tr
        
    
    

In [13]:
def test_one_epoch(net,tst_data):
    
    net.eval()
    
    run_loss = 0
    run_nb_data = 0
    run_acc = 0
    
    for ids, (mb_x1, x1_len, mb_x2, x2_len, mb_y) in enumerate(tst_data):
        
        
        x1 = Variable( torch.LongTensor(mb_x1).type(dtypeLong) , requires_grad=False)
        x2 = Variable( torch.LongTensor(mb_x2).type(dtypeLong) , requires_grad=False)
        target = Variable( torch.LongTensor(mb_y).type(dtypeLong) , requires_grad=False)
        
        
        pred = net.forward(x1,x2,x1_len,x2_len)
        
        loss = net.loss(pred,target)
        
        run_loss += (loss.detach().item())*float(mb_x1.shape[0])
        run_nb_data += float(mb_x1.shape[0])
        
        _, batch_predicted_labels = torch.max(pred, dim=1) 
        
        
        acc = metrics.accuracy_score(np.array(target), batch_predicted_labels.cpu().numpy())
        run_acc += acc*float(mb_x1.shape[0])
        
        
    acc_dev = run_acc/run_nb_data
    loss_dev = run_loss/run_nb_data
    
    return loss_dev, acc_dev
    
    
    

In [None]:
#Check and prepare data set

def check_CNN_dataset_exists(path_data='./data/'):
    """
    Make sure the CNN dataset files exist under `path_data`.

    Looks for cnn/train.txt, cnn/test.txt and cnn/dev.txt inside `path_data`.
    If any is missing, cnn.tar.gz is downloaded with `wget` into `path_data`
    and extracted there. `path_data` may be given with or without a trailing
    separator. Returns None.
    """
    required = [os.path.join(path_data, 'cnn', name)
                for name in ('train.txt', 'test.txt', 'dev.txt')]

    if all(os.path.isfile(path) for path in required):
        print("CNN dataset is already there!")
        return

    print('CNN dataset missing - downloading...')
    if not os.path.exists(path_data):
        os.makedirs(path_data)
    url = "http://cs.stanford.edu/~danqi/data/cnn.tar.gz"
    # Download and extract into the directory that was checked above,
    # not into a hard-coded ./data/.
    os.system('wget %s -P %s' % (url, path_data))
    # NOTE(review): extractall trusts the member paths of the downloaded
    # archive; only use with a source you trust.
    with tarfile.open(os.path.join(path_data, 'cnn.tar.gz'), "r:gz") as tar:
        tar.extractall(path_data)
        

# Verify (and if necessary download) the dataset under the default ./data/ directory.
check_CNN_dataset_exists()

In [14]:
#Loading data


fin_train = "./data/cnn/train.txt" # path of the training data text file
fin_dev = "./data/cnn/dev.txt"     # path of the validation data text file


# Each result is a (documents, questions, answers) tuple with entity markers relabeled per example.
train_exps = load_data(fin_train, relabeling=True)
dev_exps = load_data(fin_dev, relabeling=True)

#Examples: 380298
#Examples: 3924


In [15]:
#Building dictionaries


# Vocabulary over training documents and questions.
word_dict = build_dict(train_exps[0] + train_exps[1])
# Candidate answers: every '@entity' token in the vocabulary plus every training answer.
# Sorted so label ids are the same on every kernel run (set iteration order of
# strings varies between Python processes).
entity_markers = sorted(set([w for w in word_dict.keys() if w.startswith('@entity')] + train_exps[2]))
# Label 0 is reserved for answers that are not a known entity.
entity_markers = ['<unk_entity>'] + entity_markers
entity_dict = {w: index for (index, w) in enumerate(entity_markers)}

#Words: 118432 -> 118432
('the', 15383021)
(',', 13757778)
('.', 11782121)
('to', 7208903)
('"', 6967510)
...
('slingers', 1)
('multi-planet', 1)
('johnstons', 1)
('shir', 1)
('khurma', 1)


In [21]:
#Generating Embeddings

embedding_size = 300

# Embeddings can be initialized from GloVe or fastText pretrained vectors, or randomly.
# Exactly one of the three calls below should be active.

embeddings = gen_embeddings(word_dict, embedding_size, in_file='./data/glove/glove.6B.300d.txt', init_from='glove')
#embeddings = gen_embeddings(word_dict, embedding_size, in_file='./data/fasttext/wiki.en.vec', init_from='fasttext')
#embeddings = gen_embeddings(word_dict, embedding_size, in_file=None, init_from='random')

Embeddings: 118436 x 300
Loading embedding file: glove_embed/glove.42B.300d.txt
Pre-trained: 102079 (86.19%)


In [22]:
#Vectorizing and minibatch creation

trn_x1, trn_x2, trn_l, trn_y, max_d, max_q = vectorize(train_exps, word_dict, entity_dict)
train_data = gen_examples(trn_x1, trn_x2, trn_l, trn_y, 128, max_d, max_q)


# Pad the dev batches with the dev set's own maxima: reusing the training
# maxima fails as soon as a dev document or question is longer than every
# training one.
dev_x1, dev_x2, dev_l, dev_y, dev_max_d, dev_max_q = vectorize(dev_exps, word_dict, entity_dict)
dev_data = gen_examples(dev_x1, dev_x2, dev_l, dev_y, 128, dev_max_d, dev_max_q)

Vectorization: processed 0 / 380298
Vectorization: processed 10000 / 380298
Vectorization: processed 20000 / 380298
Vectorization: processed 30000 / 380298
Vectorization: processed 40000 / 380298
Vectorization: processed 50000 / 380298
Vectorization: processed 60000 / 380298
Vectorization: processed 70000 / 380298
Vectorization: processed 80000 / 380298
Vectorization: processed 90000 / 380298
Vectorization: processed 100000 / 380298
Vectorization: processed 110000 / 380298
Vectorization: processed 120000 / 380298
Vectorization: processed 130000 / 380298
Vectorization: processed 140000 / 380298
Vectorization: processed 150000 / 380298
Vectorization: processed 160000 / 380298
Vectorization: processed 170000 / 380298
Vectorization: processed 180000 / 380298
Vectorization: processed 190000 / 380298
Vectorization: processed 200000 / 380298
Vectorization: processed 210000 / 380298
Vectorization: processed 220000 / 380298
Vectorization: processed 230000 / 380298
Vectorization: processed 24000

In [23]:
#Model parameters

lr = 0.1                # initial SGD learning rate
decay_rate = 1.2        # lr is divided by this when the dev loss stops improving
dev_loss_old = 1000000  # dev loss of the previous epoch; starts high so the first epoch never triggers a decay
epochs_no = 20          # number of training epochs
drp = 0.25              # dropout rate passed to the model



In [24]:
#Building model


# One output class per entry of entity_dict.
net = Basic_LSTM_model(embeddings=embeddings, hidden_size=128, output_size=len(entity_dict),drp_rate=drp)
if torch.cuda.is_available():
    net.cuda()
# SGD optimizer over the model parameters with the initial learning rate.
opt = net.update(lr)

In [None]:
# Save per-epoch results in a timestamped .txt file under logs/
os.makedirs('logs', exist_ok=True)  # the directory may not exist on a fresh checkout
time_stamp = datetime.datetime.now().strftime("%y-%m-%d--%H-%M-%S")
file_name = 'logs'+'/'+time_stamp + "unidr" + ".txt"
mystr = "lstm_basic_sgd"

start = time.time()

# Line-buffered (buffering=1) so each epoch line reaches disk immediately;
# `with` closes the log even if training is interrupted or raises.
with open(file_name,"w",1) as file:
    file.write(time_stamp+'\n\n')
    file.write(mystr+'\n\n')

    for epoch in range(epochs_no):

        start_epoch = time.time()

        tr_loss, tr_acc = train_one_epoch(net,opt,train_data)

        dev_loss, dev_acc = test_one_epoch(net,dev_data)

        # Decay the learning rate when the dev loss improved by less than 1%.
        if dev_loss > 0.99* dev_loss_old:
            lr /= decay_rate
        opt = net.update_learning_rate(opt, lr)
        dev_loss_old = dev_loss

        ep_details = ( 'Epoch {EP} [epoch length:{epoch_time:.0f}s | time from start:{fromstart:.1f}h] \t'
                       'lr={LR:.2e}\t'
                       'loss={train_loss:.3f}/{test_loss:.3f}\t'
                       'acc:{train_acc:.3f}/{test_acc:.3f} '.format(
                        EP=epoch,
                        epoch_time=time.time()-start_epoch,
                        fromstart=(time.time()-start)/3600,
                        LR= lr,
                        train_loss=tr_loss, test_loss=dev_loss,
                        train_acc=tr_acc, test_acc=dev_acc ) )

        file.write(ep_details+'\n')
        print(ep_details)
    

In [28]:
fin_test = "./data/cnn/test.txt"   # path of the testing data text file
test_exps = load_data(fin_test, relabeling=True)

# NOTE: this rebinds the notebook-level max_d / max_q to the test set's maxima.
tst_x1, tst_x2, tst_l, tst_y, max_d, max_q = vectorize(test_exps, word_dict, entity_dict)
test_data = gen_examples(tst_x1, tst_x2, tst_l, tst_y, 128, max_d, max_q)

#Examples: 3198
Vectorization: processed 0 / 3198


In [29]:
# Final evaluation of the trained model on the held-out test batches.
tst_loss, tst_acc = test_one_epoch(net,test_data)
print("Test loss and accuracy = ",tst_loss,"\t",tst_acc,)

Test loss and accuracy =  1.8046578242675895 	 0.46153846153846156


### Results

After 20 epochs with a dropout rate of 0.25, the training/validation/testing accuracies obtained were 0.460/0.442/0.462, respectively.

These results show that a basic LSTM module on its own is not powerful enough to capture and process the relations between the document and the question in order to predict the answer.