# II Character RNN (LSTM) 

In [2]:
import string

In [None]:
import re
import time
import torch
import random
import string
import unidecode
import matplotlib.pyplot as plt


torch.backends.cudnn.deterministic = True

In [None]:
# Mount Google Drive at /content/drive so later cells can read files from it.
# `google.colab` is imported here, so this cell needs an environment that provides it.
from google.colab import drive 
drive.mount('/content/drive')

In [None]:
# Make modules stored in the Drive notebook folder importable.
# `sys` is imported here because no other cell in the notebook imports it.
import sys

sys.path.append('/content/drive/My Drive/Colab Notebooks')

In [None]:
# --- Reproducibility and hardware ---
RANDOM_SEED = 123
DEVICE = torch.device('cpu')

# --- Model size ---
EMBEDDING_DIM = 100
HIDDEN_DIM = 100
NUM_HIDDEN_LAYERS = 1

# --- Training schedule ---
NUM_ITER = 5000
LEARNING_RATE = 0.005

# Length (in characters) of the text portions drawn from the corpus.
TEXT_PORTION_SIZE = 200

# Seed PyTorch's default random number generator.
torch.manual_seed(RANDOM_SEED)

In [3]:
string.printable

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [None]:
# Copy the dataset from the Drive folder into the current working directory.
# `shutil` is imported here because no other cell in the notebook imports it.
import shutil

shutil.copy('/content/drive/My Drive/Colab Notebooks/datasets/covid19_faq.txt', 'covid19_faq.txt')

In [None]:
# Read the whole corpus into memory. The encoding is given explicitly so the
# result does not depend on the platform's default locale.
with open('covid19_faq.txt', 'r', encoding='utf-8') as f:
    textfile = f.read()

# convert special (non-ASCII) characters to their closest ASCII equivalents
textfile = unidecode.unidecode(textfile)

# collapse every run of spaces into a single space
textfile = re.sub(r' +', ' ', textfile)

TEXT_LENGTH = len(textfile)

print(f'Number of characters in text: {TEXT_LENGTH}')

In [None]:
random.seed(RANDOM_SEED)

def random_portion(textfile):
    """Return a random contiguous slice of ``textfile``.

    The slice is always exactly ``TEXT_PORTION_SIZE + 1`` characters long:
    one extra character is included so that callers can derive an input
    sequence and a one-step-shifted target sequence of ``TEXT_PORTION_SIZE``
    characters each.

    Args:
        textfile: The full text to sample from.

    Returns:
        A substring of ``textfile`` with ``TEXT_PORTION_SIZE + 1`` characters.

    Raises:
        ValueError: If ``textfile`` has fewer than ``TEXT_PORTION_SIZE + 1``
            characters.
    """
    portion_len = TEXT_PORTION_SIZE + 1
    if len(textfile) < portion_len:
        raise ValueError(
            f'text must contain at least {portion_len} characters, '
            f'got {len(textfile)}'
        )

    # random.randint is inclusive on both ends, so the largest valid start
    # index is len(textfile) - portion_len; anything larger would silently
    # yield a truncated slice.
    start_index = random.randint(0, len(textfile) - portion_len)
    return textfile[start_index:start_index + portion_len]

print(random_portion(textfile))

In [None]:
def char_to_tensor(text):
    """Encode ``text`` as a 1-D int64 tensor of character indices.

    Each character is mapped to its position in ``string.printable``.
    A character that is not in ``string.printable`` raises ``ValueError``.
    """
    alphabet = string.printable
    indices = list(map(alphabet.index, text))
    return torch.tensor(indices).long()

print(char_to_tensor('abcDEF'))

In [None]:
def draw_random_sample(textfile):
    """Draw one random training pair from ``textfile``.

    Returns:
        A tuple ``(inputs, targets)`` of index tensors where ``targets`` is
        ``inputs`` shifted one character to the right, i.e. ``targets[i]``
        is the character that follows ``inputs[i]`` in the text.
    """
    encoded = char_to_tensor(random_portion(textfile))
    # Everything but the last char is input; everything but the first is target.
    return encoded[:-1], encoded[1:]

In [None]:
draw_random_sample(textfile)

In [None]:
class RNN(torch.nn.Module):
    """Character-level language model: embedding -> LSTM -> linear layer.

    The model is driven one character at a time (sequence length 1,
    batch size 1); the caller threads the LSTM state through successive
    calls.

    Args:
        input_size: Number of distinct input symbols (embedding rows).
        embed_size: Dimensionality of the character embeddings.
        hidden_size: Number of features in the LSTM hidden state.
        output_size: Number of output classes (logits per step).
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, embed_size,
                 hidden_size, output_size, num_layers):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embed = torch.nn.Embedding(num_embeddings=input_size,
                                        embedding_dim=embed_size)
        self.rnn = torch.nn.LSTM(input_size=embed_size,
                                 hidden_size=hidden_size,
                                 num_layers=num_layers)
        self.fc = torch.nn.Linear(hidden_size, output_size)

    def forward(self, features, hidden_and_cell_state):
        """Process one character and return its logits and the new state.

        Args:
            features: Index tensor holding a single character.
            hidden_and_cell_state: Tuple ``(hidden, cell)`` as returned by
                ``init_zero_state`` or by a previous call.

        Returns:
            A tuple ``(output, hidden_and_cell_state)`` where ``output`` has
            shape ``[1, output_size]`` for a single-character input.
        """
        # text dim: [1] -> [[1]] (features = 1 character)
        features = features.view(1, -1)

        # embedded dim = [text length, batch size, embedding dim] = [1, 1, embedding dim]
        embedded = self.embed(features)

        output, hidden_and_cell_state = self.rnn(embedded, hidden_and_cell_state)
        # 1. output dim: [sentence length, batch size, hidden dim] = [1, 1, hidden dim]
        # 2. (last) hidden dim: [num layers, batch size, hidden dim] = [num layers, 1, hidden dim]
        # 3. cell dim: [num layers, batch size, hidden dim] = [num layers, 1, hidden dim]

        # Drop the length-1 sequence axis (out of place, so the LSTM's own
        # output tensor is left untouched).
        output = output.squeeze(0)
        output = self.fc(output)
        return output, hidden_and_cell_state

    def init_zero_state(self):
        """Return all-zero ``(hidden, cell)`` states for a batch of size 1.

        The states are created on the same device as the model's parameters,
        so they stay valid wherever the model has been moved.
        """
        device = self.fc.weight.device
        state_shape = (self.num_layers, 1, self.hidden_size)
        init_hidden = torch.zeros(state_shape, device=device)
        init_cell = torch.zeros(state_shape, device=device)
        return (init_hidden, init_cell)

In [None]:
# Re-seed immediately before building the model so that the weight
# initialisation is reproducible regardless of what ran earlier.
torch.manual_seed(RANDOM_SEED)

# The vocabulary is string.printable, used for both inputs and outputs.
model = RNN(
    input_size=len(string.printable),
    embed_size=EMBEDDING_DIM,
    hidden_size=HIDDEN_DIM,
    output_size=len(string.printable),
    num_layers=NUM_HIDDEN_LAYERS,
).to(DEVICE)

optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
def evaluate(model, prime_str='A', predict_len=100, temperature=0.8):
    """Generate text from ``model`` by sampling one character at a time.

    Based on https://github.com/spro/practical-pytorch/
    blob/master/char-rnn-generation/char-rnn-generation.ipynb

    Args:
        model: Model exposing ``init_zero_state()`` and a call signature
            ``model(char_index, state) -> (logits, state)``.
        prime_str: Non-empty seed text used to build up the hidden state.
        predict_len: Number of characters to generate after ``prime_str``.
        temperature: Divisor applied to the logits before sampling; lower
            values make the sampling more conservative.

    Returns:
        ``prime_str`` followed by ``predict_len`` generated characters.

    Raises:
        ValueError: If ``prime_str`` is empty.
    """
    if not prime_str:
        raise ValueError('prime_str must contain at least one character')

    prime_input = char_to_tensor(prime_str)
    pieces = [prime_str]

    # Generation never needs gradients; disabling them here keeps the
    # function cheap even when it is called outside a no_grad() context.
    with torch.no_grad():
        hidden_and_cell_state = model.init_zero_state()

        # Use priming string to "build up" hidden state
        for p in range(len(prime_str) - 1):
            _, hidden_and_cell_state = model(prime_input[p].to(DEVICE), hidden_and_cell_state)
        inp = prime_input[-1]

        for _ in range(predict_len):
            output, hidden_and_cell_state = model(inp.to(DEVICE), hidden_and_cell_state)

            # Sample from the network as a multinomial distribution.
            # softmax yields the same distribution as exp(logits / T) but
            # cannot overflow to inf for large logits or small temperatures.
            output_dist = torch.softmax(output.view(-1) / temperature, dim=0)
            top_i = torch.multinomial(output_dist, 1).item()

            # Add predicted character to string and use as next input
            predicted_char = string.printable[top_i]
            pieces.append(predicted_char)
            inp = char_to_tensor(predicted_char)

    return ''.join(pieces)

In [None]:
start_time = time.time()

# One entry per logging step, i.e. one loss value every 200 iterations.
loss_list = []

for iteration in range(NUM_ITER):

    # Every sample is an independent text portion, so start from a fresh
    # LSTM state and fresh gradients.
    hidden_and_cell_state = model.init_zero_state()
    optimizer.zero_grad()

    inputs, targets = draw_random_sample(textfile)
    inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)

    # Feed the sample one character at a time and accumulate the
    # per-character cross-entropy. Iterating over the actual pairs (rather
    # than a fixed index range) keeps the loop valid for any sample length.
    loss = 0.
    for input_char, target_char in zip(inputs, targets):
        outputs, hidden_and_cell_state = model(input_char, hidden_and_cell_state)
        loss += torch.nn.functional.cross_entropy(outputs, target_char.view(1))

    # Average over the number of characters actually processed.
    loss /= len(inputs)
    loss.backward()

    ### UPDATE MODEL PARAMETERS
    optimizer.step()

    ### LOGGING
    if iteration % 200 == 0:
        with torch.no_grad():
            print(f'Time elapsed: {(time.time() - start_time)/60:.2f} min')
            print(f'Iteration {iteration} | Loss {loss.item():.2f}\n\n')
            print(evaluate(model, 'Th', 200), '\n')
            print(50*'=')

            loss_list.append(loss.item())
            plt.clf()
            plt.plot(range(len(loss_list)), loss_list)
            plt.ylabel('Loss')
            # Losses are recorded every 200 iterations, so one x-axis unit
            # corresponds to 200 training iterations.
            plt.xlabel('Iteration x 200')
            plt.savefig('loss2.pdf')

# Final loss curve (same axes as the intermediate plots saved above).
plt.clf()
plt.ylabel('Loss')
plt.xlabel('Iteration x 200')
plt.plot(range(len(loss_list)), loss_list)
plt.show()