<a href="https://colab.research.google.com/github/variable-jain/Deep-learning-experiments-with-pytorch-and-fastai/blob/main/Language_Model_from_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# A Language Model from Scratch in Pytorch

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install -Uqq fastbook
import fastbook
fastbook.setup_book()

In [2]:
from fastai.text.all import *

## The Data

In [3]:
path = untar_data(URLs.HUMAN_NUMBERS)
path.ls()

(#2) [Path('/root/.fastai/data/human_numbers/valid.txt'),Path('/root/.fastai/data/human_numbers/train.txt')]

In [4]:
lines = L()
with open(path/'train.txt') as f: lines += L(*f.readlines())
with open(path/'valid.txt') as f: lines += L(*f.readlines())
lines

(#9998) ['one \n','two \n','three \n','four \n','five \n','six \n','seven \n','eight \n','nine \n','ten \n'...]

In [5]:
text = ' . '.join([l.strip() for l in lines])
text[:100]

'one . two . three . four . five . six . seven . eight . nine . ten . eleven . twelve . thirteen . fo'

In [6]:
tokens = text.split(' ')
tokens[:10]

['one', '.', 'two', '.', 'three', '.', 'four', '.', 'five', '.']

In [7]:
vocab = L(*tokens).unique()
vocab

(#30) ['one','.','two','three','four','five','six','seven','eight','nine'...]

In [8]:
word2idx = {w:i for i,w in enumerate(vocab)}
nums = L(word2idx[t] for t in tokens)
nums

(#63095) [0,1,2,1,3,1,4,1,5,1...]

## Our first language model from scratch
predicting each word based on previous three words

### Setting up data
Creating dataloaders

In [9]:
L((tokens[i:i+3], tokens[i+3]) for i in range(0,len(tokens)-4,3))

(#21031) [(['one', '.', 'two'], '.'),(['.', 'three', '.'], 'four'),(['four', '.', 'five'], '.'),(['.', 'six', '.'], 'seven'),(['seven', '.', 'eight'], '.'),(['.', 'nine', '.'], 'ten'),(['ten', '.', 'eleven'], '.'),(['.', 'twelve', '.'], 'thirteen'),(['thirteen', '.', 'fourteen'], '.'),(['.', 'fifteen', '.'], 'sixteen')...]

In [10]:
seqs = L((tensor(nums[i:i+3]), nums[i+3]) for i in range(0,len(nums)-4,3))
seqs

(#21031) [(tensor([0, 1, 2]), 1),(tensor([1, 3, 1]), 4),(tensor([4, 1, 5]), 1),(tensor([1, 6, 1]), 7),(tensor([7, 1, 8]), 1),(tensor([1, 9, 1]), 10),(tensor([10,  1, 11]), 1),(tensor([ 1, 12,  1]), 13),(tensor([13,  1, 14]), 1),(tensor([ 1, 15,  1]), 16)...]

In [11]:
bs = 64
cut = int(len(seqs) * 0.8)
dls = DataLoaders.from_dsets(seqs[:cut], seqs[cut:], bs=64, shuffle=False)

### The language model

In [12]:
class LMModel1(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        
    def forward(self, x):
        h = F.relu(self.h_h(self.i_h(x[:,0])))
        h = h + self.i_h(x[:,1])
        h = F.relu(self.h_h(h))
        h = h + self.i_h(x[:,2])
        h = F.relu(self.h_h(h))
        return self.h_o(h)

In [13]:
learn = Learner(dls, LMModel1(len(vocab), 64), loss_func=F.cross_entropy, metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.824297,1.970941,0.467554,00:01
1,1.386973,1.823242,0.467554,00:01
2,1.417556,1.654497,0.494414,00:01
3,1.37644,1.650849,0.494414,00:01


### Recurrent Neural Network

In [14]:
class LMModel2(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        
    def forward(self, x):
        h = 0
        for i in range(3):
            h = h + self.i_h(x[:,i])
            h = F.relu(self.h_h(h))
        return self.h_o(h)

In [15]:
learn = Learner(dls, LMModel2(len(vocab), 64), loss_func=F.cross_entropy, metrics=accuracy)
learn.fit_one_cycle(4, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.854185,1.983789,0.467079,00:01
1,1.404229,1.786536,0.468029,00:01
2,1.43214,1.702118,0.489422,00:01
3,1.39879,1.681968,0.47112,00:01


## Improving the Recurrent Neural Network

We will work on three issues:
1. Maintaining the state of an RNN
2. Creating more signal
3. Add more rnn layers

### Maintaining the state of the RNN

In [16]:
class LMModel3(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = 0
    
    def forward(self, x):
        for i in range(3):
            self.h = self.h + self.i_h(x[:,i])
            self.h = F.relu(self.h_h(self.h))
        out = self.h_o(self.h)
        self.h = self.h.detach()
        return out
    
    def reset(self): self.h = 0

In [17]:
def group_chunks(ds, bs):
    m = len(ds)//bs
    new_ds = L()
    for i in range(m): new_ds += L(ds[i+m*j] for j in range(bs))
    return new_ds

In [18]:
cut = int(len(seqs)*0.8)
dls = DataLoaders.from_dsets(
    group_chunks(seqs[:cut], bs),
    group_chunks(seqs[cut:], bs),
    bs=bs, drop_last=True, shuffle=False)

In [20]:
learn = Learner(dls, LMModel3(len(vocab), 64), loss_func=F.cross_entropy, metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(10, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.693835,1.807487,0.483413,00:01
1,1.261559,1.720321,0.488462,00:01
2,1.068692,1.688939,0.476202,00:01
3,1.017563,1.58986,0.496154,00:01
4,0.967448,1.629246,0.533654,00:01
5,0.909269,1.677847,0.542548,00:01
6,0.888032,1.633154,0.579087,00:01
7,0.818515,1.646275,0.587019,00:01
8,0.778004,1.739808,0.595433,00:01
9,0.763503,1.742119,0.593029,00:01


### Creating more signal

In [23]:
sl = 16
seqs = L((tensor(nums[i:i+sl]), tensor(nums[i+1:i+sl+1])) for i in range(0, len(nums)-sl-1))
cut = int(len(seqs)*0.8)
dls = DataLoaders.from_dsets(group_chunks(seqs[:cut], bs),
                             group_chunks(seqs[cut:], bs),
                             bs=bs, drop_last=True, shuffle=False)

In [24]:
[L(vocab[o] for o in s) for s in seqs[0]]

[(#16) ['one','.','two','.','three','.','four','.','five','.'...],
 (#16) ['.','two','.','three','.','four','.','five','.','six'...]]

In [25]:
class LMModel4(Module):
    def __init__(self, vocab_sz, n_hidden):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.h_h = nn.Linear(n_hidden, n_hidden)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = 0
    
    def forward(self, x):
        outs = []
        for i in range(sl):
            self.h = self.h + self.i_h(x[:,i])
            self.h = F.relu(self.h_h(self.h))
            outs.append(self.h_o(self.h))
        self.h = self.h.detach()
        return torch.stack(outs, dim=1)
    
    def reset(self): self.h = 0

In [26]:
def loss_function(inp, targ):
    return F.cross_entropy(inp.view(-1, len(vocab)), targ.view(-1))

In [27]:
learn = Learner(dls, LMModel4(len(vocab), 64), loss_func=loss_function, metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.381184,1.892017,0.444713,00:10
1,1.126689,1.917279,0.481188,00:10
2,0.894251,1.899269,0.499881,00:10
3,0.699333,2.327491,0.510807,00:09
4,0.597686,2.460619,0.557399,00:09
5,0.528377,2.311562,0.517712,00:09
6,0.458838,2.392732,0.564265,00:10
7,0.394376,2.256725,0.588307,00:09
8,0.400234,2.227837,0.560958,00:10
9,0.379731,2.083127,0.595113,00:10


### Multilayer RNN

In [29]:
class LMModel5(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.RNN(n_hidden, n_hidden, n_layers, batch_first=True)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = torch.zeros(n_layers, bs, n_hidden)
    
    def forward(self, x):
        res, h = self.rnn(self.i_h(x), self.h)
        self.h = h.detach()
        return self.h_o(res)
    
    def reset(self): self.h.zero_()

In [30]:
learn = Learner(dls, LMModel5(len(vocab), 64, 2), 
                loss_func=CrossEntropyLossFlat(), 
                metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 3e-3)

epoch,train_loss,valid_loss,accuracy,time
0,1.35493,1.926992,0.425568,00:11
1,0.781078,2.096872,0.464115,00:11
2,0.503999,2.429739,0.463768,00:11
3,0.40882,2.927918,0.473529,00:11
4,0.35109,2.883964,0.531414,00:11
5,0.310843,2.985979,0.546067,00:11
6,0.263559,2.98191,0.572102,00:11
7,0.237775,2.987018,0.584004,00:11
8,0.213385,2.967636,0.605538,00:11
9,0.196461,3.167815,0.595827,00:11


## Special types of RNN - **LSTMs**

In [35]:
class LMModel6(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]

    def forward(self, x):
        res, h = self.rnn(self.i_h(x), self.h)
        self.h = [h_.detach() for h_ in h]
        return self.h_o(res)
    
    def reset(self): 
        for h in self.h: h.zero_()

In [36]:
learn = Learner(dls, LMModel6(len(vocab), 64, 2), 
                loss_func=CrossEntropyLossFlat(),
                metrics=accuracy, cbs=ModelResetter)
learn.fit_one_cycle(15, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.965442,2.3322,0.398522,00:21
1,0.112696,2.195316,0.616102,00:21
2,0.051219,1.814923,0.694048,00:21
3,0.039303,2.283034,0.674443,00:20
4,0.034494,2.378582,0.639366,00:21
5,0.028826,1.779936,0.687089,00:21
6,0.02495,1.600827,0.714313,00:21
7,0.022526,1.742368,0.732144,00:21
8,0.020706,1.381811,0.790332,00:21
9,0.018371,1.342356,0.787481,00:21


### Regularizing LSTMs

We use 4 types of regularization:
1. Dropout
2. Weight Tying
3. Activation Regularization and Temporal Activation Regularization (*AR* and *TAR*)
4. Weigth decay

In [39]:
class LMModel7(Module):
    def __init__(self, vocab_sz, n_hidden, n_layers, p):
        self.i_h = nn.Embedding(vocab_sz, n_hidden)
        self.rnn = nn.LSTM(n_hidden, n_hidden, n_layers, batch_first=True)
        self.drop = nn.Dropout(p)
        self.h_o = nn.Linear(n_hidden, vocab_sz)
        self.h_o.weight = self.i_h.weight
        self.h = [torch.zeros(n_layers, bs, n_hidden) for _ in range(2)]
    
    def forward(self, x):
        raw, h = self.rnn(self.i_h(x), self.h)
        out = self.drop(raw)
        self.h = [h_.detach() for h_ in h]
        return self.h_o(out), raw, out
    
    def reset(self): 
        for h in self.h: h.zero_()

In [45]:
# learn = Learner(dls, LMModel7(len(vocab), 64, 2, 0.5), 
#                 loss_func=CrossEntropyLossFlat(), metrics=accuracy, 
#                 cbs=[ModelResetter, RNNRegularizer(alpha=2, beta=1)])

In [48]:
learn = TextLearner(dls, LMModel7(len(vocab), 64, 2, 0.5),
                    loss_func=CrossEntropyLossFlat(), metrics=accuracy)

In [49]:
learn.fit_one_cycle(15, 1e-2, wd=0.1)

epoch,train_loss,valid_loss,accuracy,time
0,0.591436,1.350341,0.646241,00:21
1,0.188662,1.395507,0.701221,00:21
2,0.09217,1.679307,0.722022,00:21
3,0.063405,1.429932,0.74764,00:21
4,0.058303,1.265193,0.813541,00:21
5,0.04958,1.591477,0.781904,00:21
6,0.045117,1.614353,0.711954,00:21
7,0.039381,1.533823,0.830539,00:21
8,0.036258,1.314088,0.82055,00:21
9,0.035093,1.632174,0.80205,00:21
