In [1]:
import torchtext # https://github.com/pytorch/text, used to prepare language model dataset
from torchtext.vocab import Vectors
import torch
import torch.nn as nn
import numpy as np
import random

In [2]:
USE_CUDA = torch.cuda.is_available()

# add seed to make sure the result can be reproduced
random.seed(1)
np.random.seed(1)
torch.manual_seed(1)
if USE_CUDA:
    torch.cuda.manual_seed(1)
device = torch.device("cuda" if USE_CUDA else "cpu")

In [3]:
# hyper-parameters
BATCH_SIZE = 32
EMBEDDING_SIZE = 100
HIDDEN_SIZE = 100
NLAYERS = 2
MAX_VOCAB_SIZE = 50000
NUM_EPOCHS = 2

In [4]:
TEXT = torchtext.data.Field(lower = True) # lowercase the text, https://pytorch.org/text/data.html#fields
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(path="./text", 
    train="text.train.txt", validation="text.dev.txt", test="text.test.txt", text_field=TEXT)

In [5]:
type(train)

torchtext.datasets.language_modeling.LanguageModelingDataset

In [6]:
# build vocabulary using torchtext， max_size restricts the vocabulary size (order by frequency)
TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE)

In [7]:
len(TEXT.vocab)

50002

In [8]:
TEXT.vocab.itos[:10] # torchtext adds two more word <unk> and <pad> into the vocabulary, itos is index to string

['<unk>', '<pad>', 'the', 'of', 'and', 'one', 'in', 'a', 'to', 'zero']

In [9]:
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test), batch_size=BATCH_SIZE, device=device, bptt_len=64, repeat=False, shuffle=True)

In [10]:
# examine a training batch
it = iter(train_iter)
batch = next(it)

In [11]:
batch # batch.text and batch.target is of shape [seq_len, batch_size]


[torchtext.data.batch.Batch of size 32]
	[.text]:[torch.cuda.LongTensor of size 64x32 (GPU 0)]
	[.target]:[torch.cuda.LongTensor of size 64x32 (GPU 0)]

In [12]:
print(" ".join([TEXT.vocab.itos[i] for i in batch.text[:,1].data])) # the first batch of batch.text has 64 words (seq_len)

combine in pairs and then group into trios of pairs which are the smallest visible units of matter this parallels with the structure of modern atomic theory in which pairs or triplets of supposedly fundamental quarks combine to create most typical forms of matter they had also suggested the possibility of splitting an atom which as we know today is the source of atomic


In [13]:
print(" ".join([TEXT.vocab.itos[i] for i in batch.target[:,1].data])) # the first batch of batch.traget is just one word offset of batch.text

in pairs and then group into trios of pairs which are the smallest visible units of matter this parallels with the structure of modern atomic theory in which pairs or triplets of supposedly fundamental quarks combine to create most typical forms of matter they had also suggested the possibility of splitting an atom which as we know today is the source of atomic energy


In [14]:
# define RNN model
class RNNModel(nn.Module):
    
    def __init__(self, vocab_size, embed_size, hidden_size, nlayers, bidirectional=True, dropout=0.3):
        super(RNNModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, 
                            num_layers=nlayers, 
                            bidirectional=bidirectional, 
                            dropout=dropout)
        self.fc = nn.Linear(hidden_size * 2, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.nlayers = nlayers
        self.hidden_size = hidden_size
    
    def forward(self, text, hidden):
        '''
        text: seq_length * batch_size
        '''
        embed = self.dropout(self.embed(text)) # (seq_length, batch_size, embed_size)
        output, hidden = self.lstm(embed, hidden) # output: (seq_length, batch, num_directions * hidden_size), hidden: (nlayers*num_directions, batch, hidden_size), cell: (nlayers*num_directions, batch, hidden_size)
        # use output here because we want to know the prediction at every time t rather than only the last stage
        out_vocab = self.fc(output.view(-1, output.shape[2])) # (seq_len * batch_size, vocab_size)
        out_vocab = out_vocab.view(output.shape[0], output.shape[1], -1) # (seq_len, batch_size, vocab_size)

        return out_vocab, hidden
    
    def init_hidden(self, batch_size, requires_grad = True):
        # initialize the first hidden and cell state of lstm
        weight = next(self.parameters()) # smart way to avoid checking if the model weight is in gpu or not
        return (weight.new_zeros((self.nlayers * 2, batch_size, self.hidden_size), requires_grad=requires_grad),
                    weight.new_zeros((self.nlayers * 2, batch_size, self.hidden_size), requires_grad=requires_grad))

In [15]:
model = RNNModel(vocab_size = len(TEXT.vocab), 
                 embed_size = EMBEDDING_SIZE, 
                 hidden_size = HIDDEN_SIZE, 
                 nlayers = NLAYERS, 
                 bidirectional=True, 
                 dropout=0.3)
if USE_CUDA:
    model = model.to(device)

In [16]:
USE_CUDA

True

In [17]:
model

RNNModel(
  (embed): Embedding(50002, 100)
  (lstm): LSTM(100, 100, num_layers=2, dropout=0.3, bidirectional=True)
  (fc): Linear(in_features=200, out_features=50002, bias=True)
  (dropout): Dropout(p=0.3, inplace=False)
)

In [18]:
# detach hidden state so that BPTT will not work on all the words. Instead, set a new starting position in each batch
def repackage_hidden(hidden):
    if isinstance(hidden, torch.Tensor):
        return hidden.detach()
    else:
        return tuple(repackage_hidden(v) for v in hidden)
    
def evaluate(model, val_iter):
    model.eval()
    total_loss = 0.
    it = iter(val_iter)
    total_count = 0.
    
    with torch.no_grad():
        hidden = model.init_hidden(BATCH_SIZE, requires_grad=False)
    
        for i, batch in enumerate(it):
            data, target = batch.text, batch.target # data: (seq_length, batch_size)
            if USE_CUDA:
                data, target = data.cuda(), target.cuda()

            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden) # output: (seq_len, batch_size, vocab_size)
            loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
            total_count += np.multiply(*data.size())
            total_loss += loss.item() * np.multiply(*data.size()) # loss is averged by seq_length * batch_size
           
            
    model.train()
    return total_loss / total_count

In [19]:
loss_fn = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5) # every time we call this, learning rate reduces by 50%

In [20]:
VOCAB_SIZE = len(TEXT.vocab)
GRAD_CLIP = 5.0

In [21]:
val_losses = []

for epoch in range(NUM_EPOCHS):
    model.train()
    
    it = iter(train_iter)
    hidden = model.init_hidden(BATCH_SIZE)
    
    for i, batch in enumerate(it):
        data, target = batch.text, batch.target # data: (seq_length, batch_size)
        if USE_CUDA:
            data, target = data.cuda(), target.cuda()
            
        hidden = repackage_hidden(hidden)
        output, hidden = model(data, hidden) # output: (seq_len, batch_size, vocab_size)
        loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1)) # adjust shape based on the requirements of cross entropy loss
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP) # apply gradient clipping
        optimizer.step()
        
        if i % 1000 == 0:
            print("epoch", epoch, "loss", loss.item())
            
        # save model + learning rate decay
#         if i % 10000 == 0:
#             val_loss = evaluate(model, val_iter)
#             if len(val_losses) == 0 or val_loss < min(val_losses):
#                 torch.save(model.state_dict(), "lm.pth")
#                 print("best model saved to lm.pth")
#             else:
#                 # learning rate decay
#                 scheduler.step()

epoch 0 loss 10.822580337524414
epoch 0 loss 4.204793930053711
epoch 0 loss 2.6991500854492188
epoch 0 loss 2.2078709602355957
epoch 0 loss 1.6467599868774414
epoch 0 loss 1.3863930702209473
epoch 0 loss 1.1007905006408691
epoch 0 loss 1.0194844007492065
epoch 1 loss 0.8709428906440735
epoch 1 loss 0.8289773464202881
epoch 1 loss 0.77018141746521
epoch 1 loss 0.7213648557662964
epoch 1 loss 0.6132504343986511
epoch 1 loss 0.5537371635437012
epoch 1 loss 0.5588538646697998
epoch 1 loss 0.5723716616630554


In [None]:
# # load model
# best_model = RNNModel(vocab_size = len(TEXT.vocab), 
#                  embed_size = EMBEDDING_SIZE, 
#                  hidden_size = HIDDEN_SIZE, 
#                  nlayers = NLAYERS, 
#                  bidirectional=True, 
#                  dropout=0.3)
# if USE_CUDA:
#     best_model = best_model.to(device)
    
# best_model.load_state_dict(torch.load("lm.pth"))

In [22]:
evaluate(model, val_iter)

0.33034970410389697

In [23]:
hidden = model.init_hidden(1)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
inputs = torch.randint(VOCAB_SIZE, (1, 1), dtype=torch.long).to(device)
words = []
for i in range(100):
    output, hidden = model(inputs, hidden) # output: (1,1, vocab_size)
    word_weights = output.squeeze().exp().cpu()
    word_idx = torch.multinomial(word_weights, 1)[0] # based on the weights of each word, use multinomial distribution to select next work. Greedy serach using torch.max is deterministic which is used more often in translation
    inputs.fill_(word_idx)
    word = TEXT.vocab.itos[word_idx]
    words.append(word)
print(" ".join(words))

extensive labour formal labour precursors labour fine labour fine labour fine taj fine meat fine meat fine meat fine meat fine meat fine meat fine meat fine meat fine meat drug meat drug meat drug meat drug meat drug meat drug meat drug meat drug meat drug meat courage meat timbre meat lesion meat latex proposing latex proposing latex proposing but proposing but proposing but proposing but proposing but proposing but proposing but proposing but deuterium but sorceress but underway but underway but underway but underway but react but react but react but react but react but react but react
