# Language Model with RNN

In this notebook, we will build language model (sequence model) with RNN. We can think of it as a "deep" version of the classical N-gram language model. The trained model will allow us to generate sentences given the words (prefixes) we provided.

We will use [The Time Machine](https://www.gutenberg.org/files/35/35-0.txt) as used in D2L to train the model. This si a faily small corpus of just over 30000 words.

The entire process will follow the below steps:
1. Read the dataset, preprocess to remove spicial characters, tokenize the text, build vocabulary and transform tokens into ids.
2. Build data iterator for modeling training.
3. Create RNN modules
4. Building training and prediction steps.

# Preparing the data

## Preprocessing, tokenization, vocabulary building and token transformation

In [9]:
import re
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.transforms import VocabTransform

In [6]:
# Read the text 
def read_time_machine():
    with open("datasets/the_time_machine.txt") as f:
        lines = f.readlines()
    return [re.sub('[^A-Za-z]+',' ', line).strip().lower()  for line in lines]

In [13]:
# Token list iterator for generating vocab
def yield_tokens(data_iter, tokenizer):
    for text in data_iter:
        yield tokenizer(text)

In [14]:
def load_corpus_time_machine(min_freq = 1, max_tokens = None):
    lines = read_time_machine()
    tokenizer = get_tokenizer("basic_english")
    vocab = build_vocab_from_iterator(yield_tokens(lines,tokenizer), specials=["<unk>"], min_freq = min_freq, max_tokens=max_tokens)
    vocab.set_default_index(vocab["<unk>"])
    vocab_transform = VocabTransform(vocab)
    corpus = vocab_transform(tokenizer(" ".join(lines)))
    return corpus, vocab

In [22]:
corpus, vocab = load_corpus_time_machine()

## Create data iterator

In [25]:
import random
import torch

In [39]:
## Randon sampling: In random sampling, each example is a subsequence arbitrarily captured on the original long sequence.
def seq_data_iter_random(corpus, batch_size, num_steps):
    corpus = corpus[random.randint(0,num_steps-1):] #include random offset at the beginning
    num_subseqs = (len(corpus)-1)//num_steps  #Subtract 1 since we need to account for labels
    initial_indices = list(range(0, num_subseqs*num_steps, num_steps))
    random.shuffle(initial_indices)

    def data(pos):
        return corpus[pos:pos+num_steps]
    num_batches = num_subseqs//batch_size
    for i in range(0, batch_size*num_batches, num_batches):
        initial_indices_per_batch = initial_indices[i:i+batch_size]
        X = [data(j) for j in initial_indices_per_batch]
        Y = [data(j+1) for j in initial_indices_per_batch]
        yield torch.tensor(X), torch.tensor(Y)


In [40]:
my_seq = list(range(35))
for X, Y in seq_data_iter_random(my_seq,batch_size=2, num_steps=5):
    print("X:",X,"\nY:",Y)

X: tensor([[11, 12, 13, 14, 15],
        [21, 22, 23, 24, 25]]) 
Y: tensor([[12, 13, 14, 15, 16],
        [22, 23, 24, 25, 26]])
X: tensor([[16, 17, 18, 19, 20],
        [26, 27, 28, 29, 30]]) 
Y: tensor([[17, 18, 19, 20, 21],
        [27, 28, 29, 30, 31]])


In [43]:
## Sequential Partitioning: In this partitioning, we ensure that the subsequences from two adjacent minibatches during iteration 
## are adjacent on the original sequence.
def seq_data_iter_sequential(corpus, batch_size, num_steps):
    offset = random.randint(0, num_steps)
    num_tokens = ((len(corpus)-offset-1)//batch_size) * batch_size
    Xs = torch.tensor(corpus[offset: offset+num_tokens]).reshape(batch_size,-1)
    Ys = torch.tensor(corpus[offset+1: offset+1+num_tokens]).reshape(batch_size,-1)
    num_batches = Xs.shape[1]// num_steps
    for i in range(0, num_steps * num_batches, num_steps):
        X = Xs[:, i:i+num_steps]
        Y = Ys[:, i:i+num_steps]
        yield X, Y
    

In [44]:
my_seq = list(range(35))
for X, Y in seq_data_iter_sequential(my_seq,batch_size=2, num_steps=5):
    print("X:",X,"\nY:",Y)

X: tensor([[ 1,  2,  3,  4,  5],
        [17, 18, 19, 20, 21]]) 
Y: tensor([[ 2,  3,  4,  5,  6],
        [18, 19, 20, 21, 22]])
X: tensor([[ 6,  7,  8,  9, 10],
        [22, 23, 24, 25, 26]]) 
Y: tensor([[ 7,  8,  9, 10, 11],
        [23, 24, 25, 26, 27]])
X: tensor([[11, 12, 13, 14, 15],
        [27, 28, 29, 30, 31]]) 
Y: tensor([[12, 13, 14, 15, 16],
        [28, 29, 30, 31, 32]])


In [45]:
class SeqDataLoader:
    def __init__(self, corpus, batch_size, num_steps, use_random_iter):
        if use_random_iter:
            self.data_iter_fn = seq_data_iter_random
        else:
            self.data_iter_fn = seq_data_iter_sequential
        self.corpus = corpus
        self.batch_size = batch_size
        self.num_steps = num_steps

    def __iter__(self):
        return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)

def load_data_time_machine(batch_size, num_steps, use_random_iter = False, max_tokens = None, min_freq=1):
    corpus, vocab = load_corpus_time_machine(min_freq = min_freq, max_tokens = max_tokens)
    data_iter = SeqDataLoader(corpus, batch_size, num_steps, use_random_iter)
    return data_iter, vocab

In [46]:
data_iter, vocab = load_data_time_machine(2, 10)

# Create RNN modules

In [51]:
from torch import nn
import torch.nn.functional as F

In [57]:
class RNNModel(nn.Module):
    def __init__(self, rnn_layer, vocab_size, embed_size=-1):
        super(RNNModel,self).__init__()
        self.rnn = rnn_layer
        self.vocab_size = vocab_size
        self.num_hiddens = self.rnn.hidden_size
        if embed_size >=0:
            self.embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        else:
            self.embedding_layer = lambda X: F.one_hot(X.T.long(), vocab_size)
        
        if not self.rnn.bidirectional:
            self.num_directions = 1
            self.linear = nn.Linear(self.num_hiddens, self.vocab_size)
        else:
            self.num_directions = 2
            self.linear = nn.Linear(self.num_hiddens*2, self.vocab_size)

    def forward(self, inputs, state):
        X = self.embedding_layer(inputs)
        X = X.to(torch.float32)
        Y, state = self.rnn(X, state)
        output = self.linear(Y.reshape((-1, Y.shape[-1])))
        return output, state
    
    def begin_state(self, device, batch_size =1):
        if not isinstance(self.rnn, nn.LSTM):
            return torch.zeros((self.num_directions*self.rnn.num_layers, 
                                batch_size, self.num_hiddens), device=device)
        else:
            return (torch.zeros((self.num_directions*self.rnn.num_layers, 
                                batch_size, self.num_hiddens), device=device),
                    torch.zeros((self.num_directions*self.rnn.num_layers, 
                                batch_size, self.num_hiddens), device=device))

# Prediction

In [62]:
def rnn_pred(prefix, num_preds, net, vocab, device):
    state = net.begin_state(batch_size=1, device=device)
    outputs = [vocab[prefix[0]]]
    get_inputs = lambda: torch.tensor([outputs[-1]],device=device).reshape((1,1))
    for y in prefix[1:]:
        _, state = net(get_inputs(), state)
        outputs.append(vocab[y])
    for _ in range(num_preds):
        y, state = net(get_inputs(), state)
        outputs.append(int(y.argmax(dim=1).reshape(1)))
    return ' '.join([vocab.get_itos()[i] for i in outputs])

In [55]:
vocab_size = len(vocab)
hidden_size = 256
rnn_layer = nn.RNN(vocab_size, hidden_size)

In [59]:
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")  ## Only works on M-Series Mac
net = RNNModel(rnn_layer, vocab_size=vocab_size)
net = net.to(device)

In [64]:
rnn_pred(["my", "wife", "has"], 10, net, vocab, device)

'my <unk> has dimly swamp returned stuffy taken pathway expect decorations casting expect'

# Training

In [None]:
def train_epoch(net, train_iter, loss, optimizer, device, use_random_iter=False):
    state = None
    for X, Y in train_iter:
        if state is None or use_random_iter:
            state = net.begin_state