In [1]:
from tqdm import tqdm
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim

import nltk
# Fetch the NLTK 'gutenberg' corpus package (a later cell imports
# nltk.corpus.gutenberg as the training corpus).
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [2]:
# Fetch the NLTK 'punkt' tokenizer package.
# NOTE(review): presumably required by corpus.sents() for sentence
# splitting -- confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
from nltk.corpus import gutenberg as corpus

# Flatten the corpus into a single token stream in which every sentence is
# wrapped in '<s>' ... '</s>' markers. The last token of each sentence is
# dropped before '</s>' is appended.
# NOTE(review): presumably that last token is the closing punctuation --
# confirm, since nothing here checks it.
# For a quick debug run, iterate over corpus.sents()[:1000] instead.
words = []
for sentence in tqdm(corpus.sents()):
    words.extend(['<s>', *sentence[:-1], '</s>'])


100%|██████████| 98552/98552 [00:05<00:00, 19469.10it/s]


In [None]:
from torch.utils.data import Dataset, DataLoader

class FixedWindow(Dataset):
    """Sliding fixed-length windows over a token stream.

    Item ``idx`` is the pair ``(context_ids, target_id)`` built from the
    ``length_window`` consecutive tokens starting at position ``idx``: the
    ids of the first ``length_window - 1`` tokens and the id of the last one.

    Attributes:
        length_window: total window size (context tokens plus one target).
        vocab: distinct tokens of ``words`` in order of first appearance.
        vocabulary_size: ``len(vocab)``.
        id2word: dict mapping integer id -> token.
        word2id: dict mapping token -> integer id.
        ids: ``words`` encoded as a list of integer ids.
    """

    def __init__(self, words, length_window):
        """Build the vocabulary and encode ``words`` as ids.

        Args:
            words: re-iterable sequence of hashable tokens (it is traversed
                twice: once for the vocabulary, once for the encoding).
            length_window: window size, at least 1.

        Raises:
            ValueError: if ``length_window`` is smaller than 1.
        """
        super().__init__()
        if length_window < 1:
            raise ValueError('length_window must be >= 1, got {}'.format(length_window))
        self.length_window = length_window

        # dict.fromkeys keeps first-occurrence order, so the token -> id
        # mapping is identical on every run for the same input. Iterating a
        # set of strings is not: its order can change between interpreter
        # sessions, which would silently invalidate any saved model weights.
        self.vocab = list(dict.fromkeys(words))
        self.vocabulary_size = len(self.vocab)
        self.id2word = dict(enumerate(self.vocab))
        self.word2id = {word: id for id, word in enumerate(self.vocab)}
        self.ids = [self.word2id[word] for word in words]

    def __len__(self):
        """Return the number of complete windows (0 if the stream is too short)."""
        # n tokens hold n - length_window + 1 windows of length_window tokens.
        return max(0, len(self.ids) - self.length_window + 1)

    def __getitem__(self, idx):
        """Return ``(context_ids, target_id)`` tensors for window ``idx``.

        Negative indices count from the end.

        Raises:
            IndexError: if ``idx`` does not address a complete window.
        """
        num_windows = len(self)
        if idx < 0:
            idx += num_windows
        if not 0 <= idx < num_windows:
            # Without this check a slice past the end would silently yield a
            # truncated context instead of failing.
            raise IndexError('window index out of range')

        target_pos = idx + self.length_window - 1
        first_ids = torch.tensor(self.ids[idx:target_pos])
        last_id = torch.tensor(self.ids[target_pos])

        return first_ids, last_id



length_window = 5
dataset = FixedWindow(words, length_window)

# Sanity check: one (context, target) pair taken straight from the dataset.
x, y = dataset[10]
print('x = {}, y = {}'.format(x, y))

batch_size = 1000 # 5 to debug
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True) # shuffle=False to debug

# Preview the first few batches, decoded back to words. Only the first rows of
# each batch are shown: printing whole batches at batch_size = 1000 would
# produce thousands of output lines.
num_preview_batches = 4
num_preview_rows = 5
for nbatch, (X, y) in enumerate(dataloader):
    X_head, y_head = X[:num_preview_rows], y[:num_preview_rows]
    print('batch {}'.format(nbatch))
    print('X = {}'.format(X_head))
    print('y = {}'.format(y_head))
    for x, z in zip(X_head.numpy(), y_head.numpy()):
        print([dataset.id2word[w] for w in x], end=' ')
        print(dataset.id2word[z])
    if nbatch == num_preview_batches - 1:
        break


In [5]:
class NNLM(nn.Module):
    """Feed-forward language model over a fixed number of context tokens.

    The context ids are embedded and the embeddings concatenated. The output
    logits are the sum of three terms: a bias vector, a direct linear map of
    the concatenated embeddings, and a linear map of a tanh hidden layer
    computed from those same embeddings.

    Args:
        num_classes: vocabulary size (embedding rows and number of logits).
        dim_input: number of context tokens per example.
        dim_hidden: width of the tanh hidden layer.
        dim_embedding: size of each token embedding.
    """

    def __init__(self, num_classes, dim_input, dim_hidden, dim_embedding):
        super().__init__()
        self.num_classes = num_classes
        self.dim_input = dim_input
        self.dim_hidden = dim_hidden
        self.dim_embedding = dim_embedding

        concat_width = dim_input * dim_embedding

        # Attribute names below are the state_dict keys of saved checkpoints,
        # so they (and their registration order) are kept stable.
        self.embeddings = nn.Embedding(num_classes, dim_embedding)  # look-up table
        self.hidden1 = nn.Linear(concat_width, dim_hidden, bias=False)  # embeddings -> hidden
        self.ones = nn.Parameter(torch.ones(dim_hidden))  # hidden-layer bias, initialised to 1
        self.hidden2 = nn.Linear(dim_hidden, num_classes, bias=False)  # hidden -> logits
        self.hidden3 = nn.Linear(concat_width, num_classes, bias=False)  # embeddings -> logits
        self.bias = nn.Parameter(torch.ones(num_classes))  # output bias, initialised to 1

    def forward(self, X):
        """Return unnormalised logits of shape (-1, num_classes) for ids ``X``."""
        concat_width = self.dim_input * self.dim_embedding
        flat = self.embeddings(X).view(-1, concat_width)
        hidden = torch.tanh(self.ones + self.hidden1(flat))
        direct = self.hidden3(flat)
        via_hidden = self.hidden2(hidden)
        return self.bias + direct + via_hidden


# Model dimensions: one output class per vocabulary entry, and one input slot
# per context token (every token of a window except the last).
num_classes = dataset.vocabulary_size
dim_input = length_window - 1
dim_hidden, dim_embedding = 50, 32

# Optimisation settings.
learning_rate, num_epochs = 1e-3, 30

model = NNLM(
    num_classes=num_classes,
    dim_input=dim_input,
    dim_hidden=dim_hidden,
    dim_embedding=dim_embedding,
)

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Checkpoint file, plus switches for the training and generation cells.
path = 'NNLM.pt'
do_train = do_test = True

In [6]:
# To run on a GPU in Colab: top menu -> Runtime -> Change runtime type, then
# set Hardware accelerator to GPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
# On a CUDA machine this prints a CUDA device.
print(device)
model = model.to(device)

cuda:0


In [None]:
from torch.cuda.random import device_count
# Either train the model from scratch and save its weights to `path`, or
# restore previously saved weights from `path`.
# NOTE: the checkpoint holds only the model weights. The token <-> id mapping
# is not stored, so restored weights are only meaningful if `dataset` assigns
# the same ids as in the run that produced the checkpoint.
if do_train:
    size = len(dataloader.dataset)
    # The generation cell switches the model to eval mode; switch back so that
    # re-running this cell afterwards trains in training mode.
    model.train()
    for epoch in range(num_epochs):
        for batch, (X, y) in enumerate(dataloader):
            X, y = X.to(device), y.to(device)
            pred = model(X)
            loss = loss_fn(pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Progress report every 100 batches; `loss` stays a tensor so the
            # name never changes type between iterations.
            if batch % 100 == 0:
                current = batch * batch_size
                print('Epoch {} loss: {:>7f}  [{:>5d}/{:>5d}]'
                    .format(epoch+1, loss.item(), current, size))

    torch.save({'model_state_dict': model.state_dict()}, path)
else:
    # map_location lets a checkpoint written on a GPU be restored on a
    # CPU-only runtime (and vice versa) instead of failing to deserialise.
    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])

In [10]:
# Sample text from the model: starting from a seed context, repeatedly draw
# the next word from the predicted distribution until enough sentences have
# been closed or the word budget is exhausted, then pretty-print the result.
if do_test:
    num_sentences = 5
    max_num_words = 100

    # Seed context; it must supply exactly one word per model input slot.
    generated_words = ['<s>', 'The', 'day', 'that']
    assert len(generated_words)==dim_input # length_window-1
    nsent = 0

    model.eval()
    with torch.no_grad():
        while nsent < num_sentences and len(generated_words) < max_num_words:
            # The model conditions on the most recent dim_input words only.
            context = generated_words[-dim_input:]
            input_ids = [dataset.word2id[word] for word in context]

            pred = model(torch.tensor(input_ids).unsqueeze(0).to(device))
            probs = torch.nn.functional.softmax(pred, dim=1)

            # Draw one id at random according to probs (not the argmax).
            output_id = torch.multinomial(probs, 1)
            output_word = dataset.id2word[output_id.item()]
            generated_words.append(output_word)
            if output_word == '</s>':
                nsent += 1

    generated_text = ' '.join(generated_words)
    # Turn sentence markers into full stops; the order of the rules matters.
    for old, new in ((' </s> <s>', '.'), ('<s> ', ''), (' </s>', '.')):
        generated_text = generated_text.replace(old, new)
    # Drop the space that follows these tokenised apostrophe fragments.
    for marker in [' l\' ',' s\' ',' d\' ',]:
        generated_text = generated_text.replace(marker, marker[:-1])
    # Re-attach commas to the preceding word and turn underscores into spaces.
    for old, new in ((' , ', ', '), ('_', ' ')):
        generated_text = generated_text.replace(old, new)
    print(generated_text)

The day that the Greenland whale was gone mad gaping activity, and in spite of him watch, ' appeared - separate here yet hitherto most comical things prior. There is another to whom ; and therefore pour out drink out the dreadful gulf, And spotted, hidden herds. So out of the others, he said, let. 3 : 2 Therefore, O Aholibah, to morrow after all, in the prophecy of the kings of Israel, and to build an house away that was in the
