<a href="https://colab.research.google.com/github/usm-cos-432/InClass/blob/master/ProblemSets/CharRNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# COS 432: Character-level RNN

The source is based on https://github.com/spro/char-rnn.pytorch
and 
https://colab.research.google.com/drive/1ezg4K2VBe2BqmMd43XGukMESExF3wgDM

In [None]:
from os import path

In [None]:
#!pip install -q tqdm
from tqdm import tqdm

In [None]:
import torch
import math
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torchvision
import numpy as np
import matplotlib.pyplot as plt
from torch.autograd import Variable
import torchvision.transforms as transforms
from IPython import display
import time


In [None]:
# Use the GPU only when one is actually available, so the notebook also runs
# end-to-end (Restart & Run All) on CPU-only machines.
use_cuda = torch.cuda.is_available()

# Dataset
Download the Shakespeare corpus, preprocess it, and display some examples.

In [None]:
import requests
import string
import random

# Vocabulary: every character of string.printable. A character's position in
# this string is used as its integer id (see char_tensor).
all_characters = string.printable
# Vocabulary size; passed to CharRNN as both its input size and output size.
n_characters = len(all_characters)

def DownloadFile(url):
    """Fetch `url` over HTTP(S) and return the response body as text.

    Args:
        url: Address of the text file to download.

    Returns:
        The decoded response body (`requests.Response.text`).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail loudly instead of silently returning an HTML error page as "data".
    response.raise_for_status()
    return response.text

def char_tensor(string):
    """Encode a string as a 1-D LongTensor of character ids.

    Each character is mapped to its position in `all_characters`. Characters
    that are not part of the vocabulary are encoded as id 0, exactly as before
    (the original left the zero-initialised slot untouched for them).

    Args:
        string: The text to encode.

    Returns:
        A LongTensor of shape (len(string),).
    """
    # str.find returns -1 for an unknown character; clamp that to id 0 rather
    # than hiding every possible error behind a bare `except`.
    ids = [max(all_characters.find(char), 0) for char in string]
    return torch.tensor(ids, dtype=torch.long)

def random_training_set(chunk_len, batch_size, file):
    """Sample a random batch of (input, target) character sequences.

    Each row is a random window of `chunk_len + 1` characters from `file`;
    the input is the window without its last character and the target is the
    window without its first character, i.e. the input shifted by one.

    Args:
        chunk_len: Number of characters per training sequence.
        batch_size: Number of sequences in the batch.
        file: The training text as a string.

    Returns:
        A tuple `(inp, target)` of LongTensors, each of shape
        (batch_size, chunk_len), moved to the GPU when `use_cuda` is true.

    Raises:
        ValueError: If `file` is too short to hold one window.
    """
    # Largest valid start index (random.randint is inclusive on both ends).
    last_start = len(file) - chunk_len - 1
    if last_start < 0:
        raise ValueError(
            "file has %d characters but chunk_len=%d needs at least %d"
            % (len(file), chunk_len, chunk_len + 1)
        )

    # Explicitly zero-initialised buffers instead of uninitialised LongTensors.
    inp = torch.zeros(batch_size, chunk_len, dtype=torch.long)
    target = torch.zeros(batch_size, chunk_len, dtype=torch.long)
    for bi in range(batch_size):
        start_index = random.randint(0, last_start)
        chunk = file[start_index:start_index + chunk_len + 1]
        inp[bi] = char_tensor(chunk[:-1])
        target[bi] = char_tensor(chunk[1:])

    if use_cuda:
        inp = inp.cuda()
        target = target.cuda()
    return inp, target

def time_since(since):
    """Format the wall-clock time elapsed since `since` as '<minutes>m <seconds>s'.

    Args:
        since: A timestamp previously obtained from `time.time()`.

    Returns:
        A string such as '2m 5s'.
    """
    elapsed = time.time() - since
    whole_minutes = math.floor(elapsed / 60)
    leftover_seconds = elapsed - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)
  
# Download the Shakespeare text that serves as the training corpus.
target_url = "https://raw.githubusercontent.com/cos495/code/master/shakespeare.txt"
data = DownloadFile(target_url)
# Uncomment to inspect one random (input, target) batch: 8 sequences of length 10.
#print(random_training_set(10, 8, data))
# Print a short excerpt as a sanity check that the download worked.
print(data[10:100])

# Model
In this code we use PyTorch's built-in recurrent layers, `nn.RNN` and `nn.LSTM`, to compute the recurrent cell updates.

In [None]:
# https://github.com/spro/char-rnn.pytorch
class CharRNN(nn.Module):
    """Character-level recurrent language model: embedding -> recurrent layers -> linear.

    The model is driven one time step at a time: `forward` receives the
    character ids at a single position of every sequence in the batch and
    returns the scores for the next character plus the updated hidden state.

    Args:
        input_size: Number of distinct input character ids (embedding rows).
        hidden_size: Width of the embedding and of every recurrent layer.
        output_size: Number of output scores produced per step.
        model: Recurrent cell type, case-insensitive: "rnn", "lstm" or "gru".
        n_layers: Number of stacked recurrent layers.

    Raises:
        ValueError: If `model` does not name a supported cell type.
    """

    # Supported cell names mapped to the PyTorch layer that implements them.
    _CELLS = {"rnn": nn.RNN, "lstm": nn.LSTM, "gru": nn.GRU}

    def __init__(self, input_size, hidden_size, output_size, model="rnn", n_layers=1):
        super(CharRNN, self).__init__()
        self.model = model.lower()
        # Reject typos instead of silently training a plain RNN.
        if self.model not in self._CELLS:
            raise ValueError(
                "unknown model %r; expected one of %s" % (model, sorted(self._CELLS))
            )
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.encoder = nn.Embedding(input_size, hidden_size)
        # Select the layer from the normalised name so that e.g. "LSTM" builds
        # an LSTM, consistent with the tuple state init_hidden returns for it.
        self.rnn = self._CELLS[self.model](hidden_size, hidden_size, n_layers)
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Advance the model by one time step.

        Args:
            input: LongTensor of character ids, one per sequence in the batch.
            hidden: Recurrent state as returned by `init_hidden` or by a
                previous call to `forward`.

        Returns:
            A tuple `(output, hidden)`: scores of shape
            (batch, output_size) and the updated recurrent state.
        """
        batch_size = input.size(0)
        encoded = self.encoder(input)
        # The recurrent layer expects (seq_len, batch, features); seq_len is 1 here.
        output, hidden = self.rnn(encoded.view(1, batch_size, -1), hidden)
        output = self.decoder(output.view(batch_size, -1))
        return output, hidden

    def init_hidden(self, batch_size):
        """Return an all-zero initial recurrent state for `batch_size` sequences.

        The state is created on the same device as the model's parameters.
        For an LSTM it is an `(h_0, c_0)` tuple; otherwise a single tensor.
        Every tensor has shape (n_layers, batch_size, hidden_size).
        """
        device = self.encoder.weight.device
        shape = (self.n_layers, batch_size, self.hidden_size)
        if self.model == "lstm":
            return (torch.zeros(shape, device=device),
                    torch.zeros(shape, device=device))
        return torch.zeros(shape, device=device)

# Train

### Initialize the model

In [None]:
# Model hyperparameters.
hidden_size = 100      # width of the embedding and of each recurrent layer
learning_rate = 0.01   # step size for Adam
cell = "rnn"           # recurrent cell type passed to CharRNN
n_layers = 2           # number of stacked recurrent layers

# Input and output sizes are both the vocabulary size: the model reads a
# character id and scores every character as the possible next one.
decoder = CharRNN(
    n_characters,
    hidden_size,
    n_characters,
    model=cell,
    n_layers=n_layers,
)

# Move the model to the GPU *before* constructing the optimizer, as the
# torch.optim documentation recommends.
if use_cuda:
    decoder.cuda()

decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [None]:
# Number of training iterations; the training loop draws one random batch per "epoch".
n_epochs = 2000
# Number of characters in each training sequence.
chunk_len = 200
# Report the loss and a generated sample every this many iterations.
print_every = 100
# Number of sequences per batch.
batch_size = 100

In [None]:
def train(inp, target):
    """Run one optimisation step on a single batch and return its mean loss.

    Uses the notebook-level `decoder`, `decoder_optimizer` and `criterion`.

    Args:
        inp: LongTensor of character ids, indexed as (sequence, time step).
        target: LongTensor of the same layout holding, for every position,
            the id of the character the model should predict.

    Returns:
        The summed per-step loss divided by the number of time steps, as a
        Python float.
    """
    # Read the sizes from the batch itself rather than from the notebook-level
    # `batch_size` / `chunk_len`, so the function stays correct if those
    # globals change after the batch was sampled.
    n_sequences, n_steps = inp.size(0), inp.size(1)

    hidden = decoder.init_hidden(n_sequences)
    # An LSTM state is an (h, c) tuple, which has no `.cuda()` method; move
    # each part to wherever the batch lives instead.
    if isinstance(hidden, tuple):
        hidden = tuple(part.to(inp.device) for part in hidden)
    else:
        hidden = hidden.to(inp.device)

    decoder.zero_grad()
    loss = 0

    # Feed the sequences one time step at a time, accumulating the loss.
    for c in range(n_steps):
        output, hidden = decoder(inp[:, c], hidden)
        loss += criterion(output.view(n_sequences, -1), target[:, c])

    loss.backward()
    decoder_optimizer.step()
    return loss.item() / n_steps

# Generate Text

In [None]:
def generate(decoder, prime_str='A', predict_len=100, temperature=0.8, cuda=False):
    """Sample text from a trained character model.

    Args:
        decoder: Model exposing `init_hidden(batch_size)` and a call taking
            `(input, hidden)` and returning `(output, hidden)`.
        prime_str: Non-empty string fed to the model before sampling starts;
            it also forms the beginning of the returned text.
        predict_len: Number of characters to sample.
        temperature: Divisor applied to the model's scores before sampling.
        cuda: When true, the hidden state and inputs are moved to the GPU.

    Returns:
        `prime_str` followed by `predict_len` sampled characters.

    Raises:
        ValueError: If `prime_str` is empty.
    """
    if not prime_str:
        raise ValueError("prime_str must contain at least one character")

    def to_device(state):
        # Move a tensor, or an LSTM (h, c) tuple of tensors, to the GPU on request.
        if not cuda:
            return state
        if isinstance(state, tuple):
            return tuple(part.cuda() for part in state)
        return state.cuda()

    hidden = to_device(decoder.init_hidden(1))
    prime_input = to_device(char_tensor(prime_str).unsqueeze(0))
    predicted = prime_str

    # Sampling needs no gradients; skipping them avoids building an autograd
    # graph that grows with every generated character.
    with torch.no_grad():
        # Use priming string to "build up" hidden state
        for p in range(len(prime_str) - 1):
            _, hidden = decoder(prime_input[:, p], hidden)

        inp = prime_input[:, -1]

        for p in range(predict_len):
            output, hidden = decoder(inp, hidden)

            # Sample from the network as a multinomial distribution. softmax
            # yields the same distribution as exp() followed by normalisation
            # but does not overflow for small temperatures.
            output_dist = torch.softmax(output.view(-1) / temperature, dim=0)
            top_i = torch.multinomial(output_dist, 1).item()

            # Add predicted character to string and use as next input
            predicted_char = all_characters[top_i]
            predicted += predicted_char
            inp = to_device(char_tensor(predicted_char).unsqueeze(0))

    return predicted

In [None]:
start = time.time()
all_losses = []   # loss of every iteration, in order
loss_avg = 0      # sum of the losses since the last report

print("Training for %d epochs..." % n_epochs)
# Each "epoch" here is one optimisation step on a freshly sampled random batch.
for epoch in tqdm(range(1, n_epochs + 1)):
    loss = train(*random_training_set(chunk_len, batch_size, data))
    all_losses.append(loss)
    loss_avg += loss

    if epoch % print_every == 0:
        print('[%s (%d %d%%) %.4f]' % (time_since(start), epoch, epoch / n_epochs * 100, loss))
        print('loss: ', loss)
        # Report the mean over the window as well; a single batch loss is noisy.
        print('average loss over last %d iterations: %.4f' % (print_every, loss_avg / print_every))
        loss_avg = 0
        print(generate(decoder, 'Wh', 100, cuda=use_cuda), '\n')

In [None]:
# Persist the trained model object to disk.
# NOTE(review): this saves the whole module rather than its state_dict, so
# loading it back presumably requires the CharRNN class to be defined — confirm.
torch.save(decoder,'rnnTrainedModel.pth')


### Let's try sampling with high temperature:

In [None]:
# Sample text starting from "A" at temperature 100 (default sample length).
generate(decoder, prime_str="A", temperature= 100, cuda=use_cuda)

### Let's try sampling with low temperature:

In [None]:
# Sample text starting from "A" at temperature 0.5 (default sample length).
generate(decoder, prime_str="A", temperature= 0.5, cuda=use_cuda)

### Describe the difference
How do the samples qualitatively change? What does changing the temperature do to the distribution of possible outputs?


### Starting Prompts
Explain how the model uses the `prime_str` to generate new text. Find a good prime string and display the results. 

In [None]:
# Sample text primed with "why", using generate's default temperature and length.
generate(decoder, prime_str="why", cuda=use_cuda)

# Cell Types

### LSTM Gates
Explain the role of the LSTM gates (you may find this article useful: https://colah.github.io/posts/2015-08-Understanding-LSTMs/).


$$
\begin{aligned}
i_t &= \sigma(W_{ii} x_t + b_{ii} + W_{hi} h_{(t-1)} + b_{hi}) \\
f_t &= \sigma(W_{if} x_t + b_{if} + W_{hf} h_{(t-1)} + b_{hf}) \\
g_t &= \tanh(W_{ig} x_t + b_{ig} + W_{hg} h_{(t-1)} + b_{hg}) \\
o_t &= \sigma(W_{io} x_t + b_{io} + W_{ho} h_{(t-1)} + b_{ho}) \\
c_t &= f_t \odot c_{(t-1)} + i_t \odot g_t \\
h_t &= o_t \odot \tanh(c_t)
\end{aligned}
$$


here


### Explain how an LSTM cell is different from a simple RNN (why is it better or worse?)

here


### Train CharRNN with your LSTM cell. Compare the results to the simple RNN.