In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (20, 14)

import requests
import string

import os
import json
import nltk
import spacy
import itertools
import numpy as np 
import pandas as pd
from PIL import Image
from scipy.spatial.distance import cdist
from tqdm import tqdm_notebook as tqdm
from tqdm import tqdm as tqdm_
tqdm_.pandas()

import io
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import LabelEncoder

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms

# Seed the NumPy and torch RNGs up front: the notebook samples book ids and
# text chunks with np.random and initialises model weights with torch, so
# without a seed every "Restart & Run All" produces different results.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# spaCy English pipeline and the NLTK 'punkt' tokenizer data.
nlp = spacy.load('en')
nltk.download('punkt')

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Very simple character-level RNN
### Get some text from Project Gutenberg

In [None]:
# Pick ten *distinct* book ids. np.random.choice samples with replacement by
# default, which could download (and over-weight) the same book twice.
book_ids = np.random.choice(np.arange(start=1000, stop=1200), 10, replace=False)

texts = []
for i in book_ids:
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get('http://www.gutenberg.org/files/{}/{}.txt'.format(i, i),
                            timeout=30)
    # Skip error responses (e.g. ids with no file at this path) so that an
    # error page is not mixed into the training corpus.
    if response.ok:
        texts.append(response.text)

if not texts:
    raise RuntimeError('none of the sampled Gutenberg books could be downloaded')

# One long corpus string; the trailing expression displays its size.
text = '\n'.join(texts)
len(text)

### Define how to chunk and vectorize the text

In [None]:
# Vocabulary: every distinct character that occurs in the corpus.
all_characters = list(set(text))

# Map each character to an integer index (and back again via
# inverse_transform) with a scikit-learn LabelEncoder.
label_encoder = LabelEncoder()
label_encoder.fit(all_characters)

In [None]:
def random_chunk(text, chunk_length=100):
    """Draw one random (input chunk, next character) example from ``text``.

    Parameters
    ----------
    text : sequence
        The corpus to sample from (a string in this notebook).
    chunk_length : int
        Number of consecutive characters in the input chunk.

    Returns
    -------
    tuple
        ``(list(chunk), target)`` where ``chunk`` is ``chunk_length``
        consecutive items of ``text`` and ``target`` is the item that
        immediately follows them.

    Raises
    ------
    ValueError
        If ``text`` is too short to hold a chunk plus its target.
    """
    # A start index is valid as long as one item remains after the chunk to
    # serve as the target, i.e. start + chunk_length <= len(text) - 1.
    n_valid_starts = len(text) - chunk_length
    if n_valid_starts < 1:
        raise ValueError(
            'text of length {} is too short for chunk_length={}'.format(
                len(text), chunk_length))

    # randint's upper bound is exclusive, so this covers every valid start,
    # including the one whose target is the very last item of text.
    start_index = np.random.randint(0, n_valid_starts)
    end_index = start_index + chunk_length
    input_text = text[start_index : end_index]
    target_character = text[end_index]
    return list(input_text), target_character

### Dataset and DataLoader

In [None]:
class ChunkDataset(Dataset):
    """Dataset that serves randomly sampled, index-encoded text chunks.

    Items are not tied to their index: every ``__getitem__`` call draws a
    fresh random chunk via ``random_chunk``, and ``length`` only decides how
    many items make up one pass over the dataset.

    Parameters
    ----------
    text : str
        Corpus to sample chunks from.
    label_encoder : object
        Encoder whose ``transform`` maps a list of characters to indexes.
    length : int
        Value reported by ``len(dataset)``.
    chunk_length : int, optional
        Number of characters in each input chunk (default 100, the value
        that was previously fixed by ``random_chunk``'s own default).
    """
    def __init__(self, text, label_encoder, length, chunk_length=100):
        self.text = text
        self.label_encoder = label_encoder
        self.length = length
        self.chunk_length = chunk_length

    def __getitem__(self, index):
        # `index` is deliberately unused: sampling is random, not positional.
        input_text, target_character = random_chunk(self.text,
                                                    chunk_length=self.chunk_length)

        input_indexes = self.label_encoder.transform(input_text)
        # transform expects a sequence, hence the one-element list; the
        # result is therefore a length-1 sequence rather than a scalar.
        target_index = self.label_encoder.transform([target_character])
        return input_indexes, target_index

    def __len__(self):
        return self.length

In [None]:
# Hold out the final 10% of the corpus for testing. Sampling both datasets
# from the same text would let test chunks overlap with training chunks and
# make any test metric meaningless. The label encoder was fitted on the full
# text, so every character in either part can still be encoded.
split_index = int(len(text) * 0.9)

train_dataset = ChunkDataset(text[:split_index], label_encoder, length=80000)
test_dataset = ChunkDataset(text[split_index:], label_encoder, length=2000)

In [None]:
batch_size = 200


def _seed_worker(worker_id):
    """Give each DataLoader worker process its own NumPy random state.

    The dataset samples chunks with np.random. Worker processes can start
    with a copy of the parent's NumPy RNG state, in which case every worker
    would return the same "random" chunks. torch assigns each worker a
    distinct seed, so NumPy's seed is derived from it here.
    """
    # np.random.seed only accepts 32-bit values.
    np.random.seed(torch.initial_seed() % 2 ** 32)


# Pinned host memory is what allows the `non_blocking=True` host-to-GPU copies
# in the training loop to be asynchronous; it is only useful when CUDA exists.
pin_memory = torch.cuda.is_available()

train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,
                          num_workers=5,
                          worker_init_fn=_seed_worker,
                          pin_memory=pin_memory)

test_loader = DataLoader(dataset=test_dataset,
                         batch_size=batch_size,
                         num_workers=5,
                         worker_init_fn=_seed_worker,
                         pin_memory=pin_memory)

### Model: embedding → LSTM → linear decoder

In [None]:
class SequenceEncoder(nn.Module):
    """Next-character predictor: embedding -> LSTM -> dropout + linear layer.

    Parameters
    ----------
    num_embeddings : int
        Vocabulary size; also the number of output scores per sequence.
    embedding_size : int
        Dimension of each character embedding.
    hidden_size : int
        Dimension of the LSTM hidden state.
    label_encoder : object
        Encoder whose ``inverse_transform`` maps indexes back to characters;
        only used by ``predict_next_character``.
    """
    def __init__(self, num_embeddings, embedding_size, hidden_size, label_encoder):
        super(SequenceEncoder, self).__init__()
        self.label_encoder = label_encoder
        self.embedding = nn.Embedding(num_embeddings, embedding_size)
        # batch_first=True is essential: batches are laid out as
        # (batch, sequence), whereas nn.LSTM defaults to (sequence, batch).
        # Without it the recurrence runs across the *samples* of a batch and
        # `output[:, -1]` selects the last sample instead of the last step.
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)
        self.decoder = nn.Sequential(nn.Dropout(0.2),
                                     nn.Linear(hidden_size, num_embeddings))

    def forward(self, indexes):
        """Score the next character for each sequence in ``indexes``.

        ``indexes`` is an integer tensor of shape (batch, sequence); a single
        1-D sequence is also accepted and treated as a batch of one. Returns
        unnormalised scores of shape (batch, num_embeddings).
        """
        if indexes.dim() == 1:
            indexes = indexes.unsqueeze(0)
        embedded = self.embedding(indexes)
        output, _ = self.lstm(embedded)
        # Only the LSTM output at the final time step feeds the decoder.
        return self.decoder(output[:, -1])

    def predict_next_character(self, indexes):
        """Return the most likely next character for the last sequence given.

        Runs in eval mode (dropout disabled) without tracking gradients, then
        restores whichever train/eval mode the module was in before.
        """
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                preds = self.forward(indexes)[-1]
        finally:
            self.train(was_training)
        # argmax() without a dim yields a 0-dim tensor, which cannot be
        # indexed; .item() turns it into the plain int the encoder expects.
        guess_index = preds.argmax().item()
        next_character = self.label_encoder.inverse_transform([guess_index])
        return next_character[0]

In [None]:
# Per-batch training losses. Module-level so later cells can plot them; note
# that calling train() again appends to this list rather than resetting it.
losses = []
def train(model, train_loader, loss_function, optimiser, n_epochs):
    """Train ``model`` for ``n_epochs`` passes over ``train_loader``.

    Parameters
    ----------
    model : nn.Module
        Model to optimise; batches are moved to whichever device holds its
        parameters, so this works on CPU as well as GPU.
    train_loader : iterable
        Yields ``(inputs, targets)`` batches of tensors.
    loss_function : callable
        Called as ``loss_function(preds, targets)``.
    optimiser : torch.optim.Optimizer
        Optimiser already bound to the model's parameters.
    n_epochs : int
        Number of passes over ``train_loader``.

    Every batch loss is appended to the module-level ``losses`` list.
    """
    # Follow the model instead of hard-coding .cuda(), which fails outright
    # on machines without a GPU.
    device = next(model.parameters()).device

    for epoch in range(n_epochs):
        model.train()
        loop = tqdm(train_loader)
        # The description is constant within an epoch, so set it once.
        loop.set_description('Epoch {}/{}'.format(epoch + 1, n_epochs))
        for inputs, targets in loop:
            inputs = inputs.to(device, non_blocking=True)
            # Flatten the targets to 1-D, one class index per sample.
            targets = targets.to(device, non_blocking=True).view(-1)

            optimiser.zero_grad()
            preds = model(inputs)

            loss = loss_function(preds, targets)
            loss.backward()
            optimiser.step()

            losses.append(loss.item())
            # Progress bar shows the mean of the 20 most recent batch losses.
            loop.set_postfix(loss=np.mean(losses[-20:]))

In [None]:
# Allow cuDNN to benchmark and pick its algorithms.
torch.backends.cudnn.benchmark = True

# One embedding row and one output score per distinct character.
model = SequenceEncoder(
    num_embeddings=len(all_characters),
    embedding_size=128,
    hidden_size=256,
    label_encoder=label_encoder,
)
model = model.to(device)

# Hand the optimiser only the parameters that actually require gradients.
trainable_parameters = (p for p in model.parameters() if p.requires_grad)
optimiser = optim.Adam(trainable_parameters, lr=0.001)
loss_function = nn.CrossEntropyLoss()

In [None]:
# Run ten epochs of training; batch losses accumulate in `losses`.
train(model, train_loader, loss_function, optimiser, n_epochs=10)

In [None]:
# Smooth the noisy per-batch losses with a 15-batch rolling mean.
loss_data = pd.Series(losses).rolling(window=15).mean()

ax = loss_data.plot()
ax.set_ylim(0, 5)
# Label the figure so it can be read on its own when skimming the notebook.
ax.set_title('Training loss (rolling mean over 15 batches)')
ax.set_xlabel('training batch')
ax.set_ylabel('cross-entropy loss');

In [None]:
text_chunk, target_character = random_chunk(text)

# Build the index tensor as int64 directly on the selected device (no
# float round-trip, no hard-coded .cuda()).
indexes = torch.as_tensor(label_encoder.transform(text_chunk),
                          dtype=torch.long, device=device)
# nn.LSTM reads (sequence, batch) unless it was built with batch_first=True,
# so lay this single sequence out the way the model's LSTM expects it.
indexes = indexes.unsqueeze(0) if model.lstm.batch_first else indexes.unsqueeze(1)

# Inference: switch dropout off and skip gradient tracking.
model.eval()
with torch.no_grad():
    x = model(indexes)[-1]

# .item() yields a plain int; the label encoder cannot take a torch tensor
# that lives on the GPU.
predicted_character = label_encoder.inverse_transform([x.argmax().item()])[0]

print(''.join(text_chunk))
print('''
------------------------------
predicted character:\t{}
actual character:\t{}
      '''.format(predicted_character,
                 target_character))

In [None]:
text_chunk, target_character = random_chunk(text)

def predict(text_chunk):
    """Return the model's guess for the character that follows ``text_chunk``.

    ``text_chunk`` is a list of characters known to the label encoder. It is
    encoded, moved to the device holding the model and passed to
    ``model.predict_next_character`` as a single sequence.
    """
    indexes = label_encoder.transform(text_chunk)
    # int64 tensor created directly on the model's device: no float
    # round-trip and no hard-coded .cuda(), which fails without a GPU.
    indexes = torch.as_tensor(indexes, dtype=torch.long,
                              device=next(model.parameters()).device)
    # nn.LSTM reads (sequence, batch) unless built with batch_first=True; use
    # the layout the model's LSTM expects so the whole chunk is treated as one
    # sequence rather than as many one-character sequences.
    if model.lstm.batch_first:
        indexes = indexes.unsqueeze(0)
    else:
        indexes = indexes.unsqueeze(1)
    return model.predict_next_character(indexes)

In [None]:
# Start from a fresh random chunk of real text ...
text_chunk, target_character = random_chunk(text)

# ... then grow it by 500 characters, each predicted from the 20 characters
# that currently end the chunk (generated ones included).
for i in range(500):
    next_character = predict(text_chunk[-20:])
    text_chunk += [next_character]

In [None]:
# Show the seed chunk followed by the generated continuation as one string.
"".join(text_chunk)