# LSTM based Language Model with Beam Search
Abhimanyu Talwar

In [0]:
import torch
import torch.nn as nn
import torchtext
from torchtext.vocab import Vectors
from torchtext.datasets import WikiText2

from torchtext.data.iterator import BPTTIterator
from torchtext.data import Batch, Dataset
import math

import sys
import time

import numpy as np

# Download Dataset & Create Train, Validation, Test Sets

In [0]:
# set up fields
TEXT = torchtext.data.Field(lower=True)

In [3]:
# make splits for data
train, val, test = WikiText2.splits(TEXT)

downloading wikitext-2-v1.zip


wikitext-2-v1.zip: 100%|██████████| 4.48M/4.48M [00:00<00:00, 17.5MB/s]


extracting


In [4]:
for set_, name_ in zip([train, val, test], ['Train', 'Validation', 'Test']):
    print('Length of {} Set is {}'.format(name_, len(set_)))

Length of Train Set is 1
Length of Validation Set is 1
Length of Test Set is 1


In [5]:
MAX_SIZE = 30000
if MAX_SIZE is None:
    TEXT.build_vocab(train)
else:
    TEXT.build_vocab(train, max_size=MAX_SIZE)
print('Vocabulary Length: ', len(TEXT.vocab))

Vocabulary Length:  28913


In [6]:
print('Pad Token:     ', TEXT.pad_token)
print('Unknown Token: ', TEXT.unk_token)
print('EOS Token:     ', TEXT.eos_token)

Pad Token:      <pad>
Unknown Token:  <unk>
EOS Token:      None


In [0]:
train_iter, val_iter, test_iter = BPTTIterator.splits((train, val, test), \
                               batch_size=32, device=torch.device("cuda"), bptt_len=30)

In [8]:
it = iter(train_iter)
batch = next(it) 
print("Size of text batch [max bptt length, batch size]", batch.text.shape)

Size of text batch [max bptt length, batch size] torch.Size([30, 32])


In [9]:
print('='*90)
print('Raw form of [batch.text]: ')
print(' '*15, batch.text[0,:])
print('='*90)
print('Text form of [batch.text]: ')
print(' '*15, ' '.join([TEXT.vocab.itos[i] for i in batch.text[:,0].cpu().numpy()]))
print('='*90)
print('Text form of [batch.target]: ')
print(' '*15, ' '.join([TEXT.vocab.itos[i] for i in batch.target[:,0].cpu().numpy()]))
print('='*90)

Raw form of [batch.text]: 
                tensor([    9,  1499,    57,    26,  1439,  7863,   159,   180,    28,   691,
          235,     8,  1288,    59, 13430,    47,    12,    85,     6,  5979,
        16940,  2492,    19,     3,     5,  4912,     4, 24526,    30,     7,
           23,    36], device='cuda:0')
Text form of [batch.text]: 
                <eos> = valkyria chronicles iii = <eos> <eos> senjō no valkyria 3 : <unk> chronicles ( japanese : 戦場のヴァルキュリア3 , lit . valkyria of the battlefield 3 ) , commonly
Text form of [batch.target]: 
                = valkyria chronicles iii = <eos> <eos> senjō no valkyria 3 : <unk> chronicles ( japanese : 戦場のヴァルキュリア3 , lit . valkyria of the battlefield 3 ) , commonly referred


# Declare Model and Train

## Functions for predicting last word and generating a random sentence  

In [0]:
def PredictLastWord(model, sentence_txt):
    """
    INPUTS:
        sentence_txt:  string, a sentence whose last word we want to predict
                       e.g. 'he is eating a'
    OUTPUTS:
        predictions:   list, of top 20 words (strings) which could be the last
                       word of sentence_txt
        top_log_probs: list, of the log probabilities for the top 20 predicted
                       words
    """
    model.eval()
    sent_tokens = sentence_txt.split(' ')
    indices = [TEXT.vocab.stoi[sent_tokens[-i]] for i in range(len(sent_tokens), 0, -1)]
    #print(' '.join([TEXT.vocab.itos[i] for i in indices]))
    indices = np.expand_dims(np.asarray(indices), axis=1)
    inputs = torch.from_numpy(indices).to('cuda')
    log_probs, _ = model(inputs)
    top_log_probs, topidx = log_probs[-1,0,:].topk(20)
    predictions = [TEXT.vocab.itos[i] for i in topidx.cpu().numpy()]
    return top_log_probs, predictions

def GenerateRandom(model, seed_word, sentence_len=10):
    model.eval()
    bad_words = ['<eos>', '<unk>', '"', '.', '@-@', '@.@']
    bad_tokens = [TEXT.vocab.stoi[w] for w in bad_words]
    minus_inf = -1e4
    
    sentence_ = [seed_word]
    idx = TEXT.vocab.stoi[seed_word]
    lstm_states = None
    for i in range(sentence_len):
        idx = np.expand_dims(np.asarray([idx]), axis=1)
        inputs = torch.from_numpy(idx).to('cuda')

        log_probs, (h, c) = model(inputs, lstm_states=lstm_states)
        log_probs[:,:,bad_tokens] = minus_inf
        
        top_log_prob, topidx = log_probs[-1,0,:].topk(10)
        topidx = np.random.choice(topidx.cpu().numpy(), size=1)[0]
        
        new_word = TEXT.vocab.itos[topidx]
        sentence_ += [new_word]
        lstm_states = (h, c)
        idx = topidx
    return ' '.join(sentence_)

## Beam Search Implementation

In [0]:
def BeamSearch(model, seed_phrase, beam_size=100, sentence_len=10):
    """
    INPUTS:
        seed_phrase:  string, which denotes start of a sentence, 
                      e.g. 'he is a'
        sentence_len: integer, number of words to be generated in 
                      addition to the seed_phrase
        beam_size:    integer, size of the beam for Beam Search
    OUTPUT:
        result:       list, of beam_size number of sentences generated
                      using Beam Search
    """
    def IdxToTensor(indices):
        indices = np.expand_dims(np.asarray(indices), axis=1)
        tensor_idx = torch.from_numpy(indices).to('cuda')
        return tensor_idx
    
    
    minus_inf = -1e4
    model.eval()
    bad_words = ['<eos>', '<unk>', '.', '"', '@-@', '@.@']
    bad_tokens = [TEXT.vocab.stoi[w] for w in bad_words]
    
    lstm_states = None
    seed_words = seed_phrase.split()
    seed_indices = [TEXT.vocab.stoi[w] for w in seed_words]
    for word in seed_words:
        word_idx = TEXT.vocab.stoi[word]
        inputs = IdxToTensor([word_idx])
        log_probs, (h, c) = model(inputs, lstm_states=lstm_states)
        lstm_states = (h, c)
    
    log_probs[:,:,bad_tokens] = minus_inf
    
    top_log_probs, top_idx = log_probs.flatten().topk(beam_size)
    
    beam = [top_idx.tolist()]
    beam_data = [(seed_indices + [b], lstm_states) for b in beam[0]]
    running_log_probs = top_log_probs.unsqueeze(dim=1).unsqueeze(dim=0)
    
    for i in range(sentence_len):
        h_in, c_in = [], []
        for j, _ in enumerate(beam[0]):
            _, (h_, c_) = beam_data[j]
            h_in += [h_]
            c_in += [c_]

        h_in = torch.cat(h_in, dim=1)
        c_in = torch.cat(c_in, dim=1)
        
        inputs = torch.from_numpy(np.asarray(beam)).to('cuda')
        
        log_probs_out, (h_out, c_out) = model(inputs, (h_in, c_in))
        log_probs_out[:,:,bad_tokens] = minus_inf
        
        updated_log_probs = log_probs_out + running_log_probs
        
        if i < sentence_len - 1:
            running_log_probs, top_idx = updated_log_probs.flatten().topk(beam_size)
        else:
            running_log_probs, top_idx = updated_log_probs.flatten().topk(beam_size)
        
        running_log_probs = running_log_probs.unsqueeze(dim=1).unsqueeze(dim=0)
        
        word_indices = [w % len(TEXT.vocab) for w in top_idx.tolist()]
        beam_indices = [w // len(TEXT.vocab) for w in top_idx.tolist()]

        new_beam_data = []
        for b_i, w_i in zip(beam_indices, word_indices):
            new_beam_data += [(beam_data[b_i][0] + [w_i], (h_out[:,b_i:b_i+1,:], c_out[:,b_i:b_i+1,:]))]
        
        beam = [word_indices]
        beam_data = new_beam_data
    result = [' '.join([TEXT.vocab.itos[idx] for idx in x[0][0:sentence_len+len(seed_words)]]) for k, x in enumerate(beam_data)]
    return result

## Function for training a model

In [0]:
def TrainModel(model, criterion, optimizer, train_iter, val_iter, \
               scheduler=None, num_epochs=1, update_freq=1, grad_norm=0.1):
    
    def PrintStatus(prefix_, epoch_, batch_, num_batches_, train_loss, \
                    val_loss=None, epoch_time=None):
        epoch_str = 'Epoch {}'.format(epoch_)
        batch_str = 'Batch {}/{}'.format(batch_, num_batches_)
        offset_1 = ' '*(10 - len(epoch_str))
        offset_2 = ' '*(15 - len(batch_str)) + ':  '
        value_str = 'Train Loss {:.3f}'.\
                      format(train_loss)
        status_str = prefix_ + epoch_str + offset_1 + batch_str + offset_2 + value_str
        if val_loss is not None:
            status_str += ' Valid Loss {:.3f}'.format(val_loss)
        if epoch_time is not None:
            status_str += ' [Time {:.1f}s]'.format(epoch_time)
        sys.stdout.write(status_str)
    
    
    num_batches = len(train_iter)
    num_val_batches = len(val_iter)
    best_val_loss = None
    for epoch in range(num_epochs):
        t0 = time.time()
        model.train()
        if scheduler is not None:
            scheduler.step()
        running_loss = 0.0
        epoch_loss = 0.0
        for i, batch in enumerate(train_iter):
            inputs, labels = batch.text, batch.target
            inputs, labels = inputs.to('cuda'), labels.to('cuda')

            outputs, _ = model(inputs)
            
            seq_len_ = outputs.shape[0]
            batch_size_ = outputs.shape[1]
            loss = criterion(outputs.reshape(batch_size_*seq_len_, -1), \
                             labels.reshape(batch_size_*seq_len_, -1).squeeze())
            
            optimizer.zero_grad()
            loss.backward()
            
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_norm)
            
            optimizer.step()
            running_loss += loss.item()
            epoch_loss += loss.item()
            if i % update_freq == update_freq - 1:
                train_loss = running_loss/update_freq
                if i < update_freq:
                    prefix_ = '\n'
                else:
                    prefix_ = '\r'
                PrintStatus(prefix_, epoch+1, i+1, num_batches, train_loss, \
                            None, time.time() - t0)
                running_loss = 0.0
        # Calculate validation loss
        model.eval()
        val_loss = 0.0
        for j, batch in enumerate(val_iter):
            inputs, labels = batch.text, batch.target
            inputs, labels = inputs.to('cuda'), labels.to('cuda')

            outputs, _ = model(inputs)
            
            seq_len_ = outputs.shape[0]
            batch_size_ = outputs.shape[1]
            loss = criterion(outputs.reshape(batch_size_*seq_len_, -1), \
                             labels.reshape(batch_size_*seq_len_, -1).squeeze())
            val_loss += loss.item()
        val_loss = val_loss/num_val_batches
        if best_val_loss is None or val_loss < best_val_loss:
            torch.save(model.state_dict(), 'best_lm_model.pth')
            best_val_loss = val_loss
        PrintStatus('\r', epoch+1, num_batches, num_batches, epoch_loss/num_batches, \
                    val_loss, time.time() - t0)
        # Generate random sentence from Language Model
        sample_ = GenerateRandom(model, 'the')
        print('\nRandom sample: ', sample_)
        model.train()

## Model class

In [0]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, drop_rate=0.5):
        super(LSTMModel, self).__init__()
        self.dropout = nn.Dropout(drop_rate)
        self.embed_layer = nn.Embedding(num_embeddings=vocab_size, \
                                        embedding_dim=embedding_dim)
        # BPTTIterator generates samples in (SEQ, BATCH) order and that is
        # why batch_first is set to False below (it is the default setting too).
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, \
                           num_layers=num_layers, batch_first=False)
        self.linear = nn.Linear(in_features=hidden_dim, out_features=vocab_size)
        self.dropout2 = nn.Dropout(drop_rate)
        self.logsoft = nn.LogSoftmax(dim=2)
    
    def forward(self, x, lstm_states=None):
        x = self.embed_layer(x)
        x = self.dropout(x)
        if lstm_states is None:
            x, (h_n, c_n) = self.lstm(x)
        else:
            x, (h_n, c_n) = self.lstm(x, lstm_states)
        x = self.dropout2(x)
        x = self.linear(x)
        log_probs = self.logsoft(x)
        return log_probs, (h_n, c_n)

In [0]:
# Config
VOCAB_SIZE = len(TEXT.vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 1024
NUM_LAYERS = 2
UPDATE_FREQ = 250
NUM_EPOCHS = 100

In [0]:
model = LSTMModel(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM, \
                      hidden_dim=HIDDEN_DIM, num_layers=NUM_LAYERS)
model = model.to('cuda')
criterion = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1.0, momentum=0.9)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1) 

In [16]:
TrainModel(model, criterion, optimizer, train_iter, val_iter, scheduler=scheduler, \
           update_freq=UPDATE_FREQ, num_epochs=NUM_EPOCHS, grad_norm=10.0)


Epoch 1   Batch 2176/2176:  Train Loss 6.252 Valid Loss 5.647 [Time 188.4s]
Random sample:  the 766th division and a number for its original area on

Epoch 2   Batch 2176/2176:  Train Loss 5.776 Valid Loss 5.450 [Time 193.9s]
Random sample:  the united league , as well to the first year in

Epoch 3   Batch 2176/2176:  Train Loss 5.607 Valid Loss 5.351 [Time 194.0s]
Random sample:  the song had become an active for her best friend to

Epoch 4   Batch 2176/2176:  Train Loss 5.494 Valid Loss 5.286 [Time 194.4s]
Random sample:  the school in an english manner ; the portuguese is also

Epoch 5   Batch 2176/2176:  Train Loss 5.404 Valid Loss 5.238 [Time 194.7s]
Random sample:  the most important of a single level in their final game

Epoch 6   Batch 2176/2176:  Train Loss 5.326 Valid Loss 5.200 [Time 194.7s]
Random sample:  the united kingdom on august 15 ( 2012 and 26 september

Epoch 7   Batch 2176/2176:  Train Loss 5.258 Valid Loss 5.173 [Time 194.7s]
Random sample:  the second generatio

In [99]:
example1 = 'united states is a'
PredictLastWord(model, example1)

(tensor([-3.0596, -3.2084, -3.3047, -4.0073, -4.0813, -4.2020, -4.3376, -4.4397,
         -4.5406, -4.5797, -4.5951, -4.6088, -4.8496, -4.8573, -4.8644, -4.9803,
         -5.0343, -5.0644, -5.0869, -5.1010], device='cuda:0',
        grad_fn=<TopkBackward>),
 ['single',
  'major',
  '<unk>',
  '"',
  'popular',
  'national',
  'series',
  'commercial',
  'large',
  'state',
  'common',
  'part',
  'modern',
  'country',
  'non',
  'song',
  '2',
  'mixture',
  'record',
  'very'])

# Upload Saved Model and Generate Results

In [31]:
from google.colab import files

files.upload()

Saving best_lm_model.pth to best_lm_model.pth


In [0]:
model = LSTMModel(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM, \
                      hidden_dim=HIDDEN_DIM, num_layers=NUM_LAYERS)
model.load_state_dict(torch.load('best_lm_model.pth'))
model = model.to('cuda')

## Predict the last word of a sentence

In [23]:
example2 = 'i live in new'
PredictLastWord(model, example2)

(tensor([-0.1895, -2.9968, -3.0378, -4.0996, -4.7299, -5.2429, -5.6764, -5.8815,
         -5.9518, -6.0597, -6.0897, -6.3670, -7.0204, -7.0247, -7.0433, -7.3135,
         -7.4211, -7.4310, -7.4386, -7.5110], device='cuda:0',
        grad_fn=<TopkBackward>),
 ['york',
  'zealand',
  'england',
  'jersey',
  'orleans',
  'south',
  'mexico',
  'world',
  'guinea',
  'rochelle',
  '<unk>',
  'delhi',
  'year',
  'garden',
  'music',
  'britain',
  'age',
  'hampshire',
  'spain',
  'life'])

## Generate random sentences starting with a seed word

In [85]:
# Generate 10 random sentences starting with a seed
seed_words = ['he', 'she']
for word_ in seed_words:
    print('='*90)
    print('Seed Word: ', word_)
    print('='*90)
    for i in range(10):
        print(GenerateRandom(model, word_, 15))

Seed Word:  he
he has since been appointed to his home of their children 's grandfather by james de
he was able for their second attempt with their former teammates and team partner of an
he is not considered an honorary doctorate ; his most influential work has come into three
he returned home for two seasons in three appearances on september 2 for a record eighth
he returned as to whether he would go up and sell to the other 's ,
he had the first and largest most famous player award , since he won four games
he also wrote , he is a welcome to an elderly film and was not an
he could only live up for another session , the only match the team could play
he also made appearances for an unnamed new team as he did it for their first
he has since the only one of their own children of birth and family as to
Seed Word:  she
she could do something to make it personally 's not the first , and he did
she also has two well received : she performed her career with two more songs in
she has a high 

## Generate sentences using Beam Search, starting with a seed phrase

In [90]:
seed_phrase = 'the war was won by'
result = BeamSearch(model, seed_phrase, sentence_len=20, beam_size=100)
for sent_ in result:
    print(sent_)

the war was won by the 1st and 2nd light horse brigades , the 1st and 2nd light horse brigades , and the 5th mounted
the war was won by the 1st and 2nd light horse brigades , the 1st and 2nd light horse brigades , the 5th mounted brigade
the war was won by the 1st and 2nd light horse brigades , and the 1st and 2nd light horse brigades , the 5th mounted
the war was won by the 1st and 2nd light horse brigades , the 1st and 2nd light horse brigades , and the 5th mounted
the war was won by the 1st and 2nd light horse brigades , the 1st and 2nd light horse brigades , and the 1st light
the war was won by the 1st and 2nd light horse brigade , the 1st and 2nd light horse brigades , and the 5th mounted
the war was won by the 1st and 2nd light horse brigades , and the 1st and 2nd light horse brigades , the 5th mounted
the war was won by the 1st and 2nd light horse brigades ( 1st and 2nd light horse brigades ) and the 3rd light horse
the war was won by the 1st and 2nd light horse brigades , the 1

In [0]:
from google.colab import files

files.download('best_lm_model.pth')