#### Character RNN 

In [1]:
import time 
import random 
import unidecode 
import string 
import re 
import matplotlib.pyplot as plt 
import torch 

# Ask cuDNN to select deterministic algorithms (relevant only when running on a CUDA device).
torch.backends.cudnn.deterministic = True 

In [2]:
# --- Reproducibility and hardware ---
RANDOM_SEED = 234
DEVICE = torch.device('cpu')
torch.manual_seed(RANDOM_SEED)  # seed PyTorch's global random number generator

# --- Model size ---
EMBEDDING_DIM = 100  # length of each character embedding vector
HIDDEN_DIM = 100  # length of the LSTM hidden and cell state vectors
NUM_HIDDEN_LAYER = 1  # presumably the num_layers argument of the multi-layer RNN class

# --- Optimisation ---
LEARNING_RATE = 0.005
NUM_ITER = 5_000  # number of training iterations (one sampled text portion each)

# --- Data sampling ---
TEXT_PORTION_SIZE = 200  # number of input characters per training sample


In [16]:
len(string.printable)    # vocabulary size: string.printable is the stdlib constant holding the printable ASCII characters

100

In [4]:
# Read the whole training corpus into memory as one string.
# NOTE(review): no explicit encoding is passed, so the platform default is used.
with open('covid19-faq.txt', 'r') as corpus_file:
    textfile = corpus_file.read()

# Transliterate special characters with unidecode, then collapse every run of
# spaces into a single space (newlines and other whitespace are left as-is).
textfile = re.sub(" +", " ", unidecode.unidecode(textfile))

# Total number of characters in the cleaned corpus.
TEXT_LENGTH = len(textfile)

TEXT_LENGTH

84658

In [5]:
random.seed(RANDOM_SEED)    # seed Python's `random` module so the sampled portions are reproducible
# Portions start at arbitrary character offsets, so a sample can begin or end mid-word.
def random_portion(textfile, portion_size=None):
    """Return a random contiguous slice of ``portion_size + 1`` characters.

    The extra character lets the caller derive next-character targets by
    shifting the slice one position to the right.

    Args:
        textfile: The full corpus as a string.
        portion_size: Number of input characters wanted. Defaults to the
            module-level ``TEXT_PORTION_SIZE``.

    Returns:
        A substring of ``textfile`` that is exactly ``portion_size + 1``
        characters long.

    Raises:
        ValueError: If ``portion_size`` is negative or ``textfile`` holds
            fewer than ``portion_size + 1`` characters.
    """
    if portion_size is None:
        portion_size = TEXT_PORTION_SIZE
    if portion_size < 0:
        raise ValueError("portion_size must be non-negative")

    window = portion_size + 1
    # Measure the string that was actually passed in rather than relying on a
    # global length computed for some other text.
    last_start = len(textfile) - window
    if last_start < 0:
        raise ValueError(
            f"text has {len(textfile)} characters but {window} are required"
        )

    # random.randint includes both endpoints, so the upper bound must be the
    # last offset at which a complete window still fits inside the text.
    start_index = random.randint(0, last_start)
    return textfile[start_index:start_index + window]

print(random_portion(textfile))  # sanity check: show one randomly sampled portion

 to be submitted. In addition, all current campus health protocols must be followed including wearing facial covering. Additional information on the campus travel policy is located here.

Under current


In [6]:
# Map each printable ASCII character to its position in string.printable once,
# so encoding a text costs one dictionary lookup per character instead of a
# linear scan of the 100-character alphabet.
_CHAR_TO_INDEX = {char: index for index, char in enumerate(string.printable)}


def char_to_tensor(text):
    """Encode ``text`` as a 1-D tensor of ``string.printable`` indices.

    Args:
        text: String made up only of characters found in ``string.printable``.

    Returns:
        A ``torch.long`` tensor with one index per character of ``text``
        (an empty tensor for an empty string).

    Raises:
        ValueError: If ``text`` contains a character that is not in
            ``string.printable``.
    """
    try:
        indices = [_CHAR_TO_INDEX[char] for char in text]
    except KeyError as err:
        raise ValueError(
            f"character {err.args[0]!r} is not in string.printable"
        ) from None
    # Build the tensor as int64 directly; indices are later used for
    # embedding lookups and as cross-entropy targets.
    return torch.tensor(indices, dtype=torch.long)


print(char_to_tensor("abcDEF"))

tensor([10, 11, 12, 39, 40, 41])


In [7]:
def draw_random_sample(textfile):
    """Sample one (inputs, targets) pair for next-character prediction.

    A random portion of ``textfile`` is encoded as character indices and then
    viewed twice, offset by one position, so that ``targets[i]`` is the
    character that follows ``inputs[i]`` in the text.

    Args:
        textfile: The corpus string to sample from.

    Returns:
        A tuple ``(inputs, targets)`` of 1-D index tensors, each one element
        shorter than the encoded portion.
    """
    encoded = char_to_tensor(random_portion(textfile))
    # Inputs drop the final character; targets drop the first one.
    return encoded[:-1], encoded[1:]

In [8]:
draw_random_sample(textfile)  # display one (inputs, targets) pair of index tensors

(tensor([32, 24, 94, 13, 10, 34, 28, 94, 28, 18, 23, 12, 14, 94, 29, 17, 14, 18,
         27, 94, 21, 10, 28, 29, 94, 14, 25, 18, 28, 24, 13, 14, 94, 24, 15, 94,
         31, 24, 22, 18, 29, 18, 23, 16, 94, 24, 27, 94, 13, 18, 10, 27, 27, 17,
         14, 10, 78, 94, 18, 15, 94, 29, 17, 14, 34, 94, 17, 10, 31, 14, 94, 11,
         14, 14, 23, 94, 24, 23, 94, 10, 23, 29, 18, 11, 18, 24, 29, 18, 12, 94,
         15, 24, 27, 94, 10, 29, 94, 21, 14, 10, 28, 29, 94,  2,  4, 94, 17, 24,
         30, 27, 28, 94, 18, 15, 94, 25, 27, 14, 28, 12, 27, 18, 11, 14, 13, 78,
         94, 24, 27, 94, 10, 28, 94, 10, 25, 25, 27, 24, 31, 14, 13, 94, 29, 24,
         94, 32, 24, 27, 20, 94, 11, 34, 94, 10, 94, 13, 24, 12, 29, 24, 27, 75,
         96, 96, 48, 24, 28, 29, 94, 14, 22, 25, 21, 24, 34, 14, 14, 28, 94, 32,
         18, 21, 21, 94, 11, 14, 94, 10, 11, 21, 14, 94, 29, 24, 94, 27, 14, 29,
         30, 27]),
 tensor([24, 94, 13, 10, 34, 28, 94, 28, 18, 23, 12, 14, 94, 29, 17, 14, 18, 27,
         

#### Model 

In [9]:
class RNN(torch.nn.Module):
    """Character model that chains an embedding, a (multi-layer) LSTM and a linear layer.

    NOTE(review): ``forward`` appears truncated in this notebook export -- it
    stops right after the LSTM call, never applies ``self.fc`` and returns
    nothing. Only the visible part is documented here.
    """
    def __init__ (self, input_size, embed_size, hidden_size, output_size, num_layers):
        """Create the embedding, LSTM and output layers.

        Args:
            input_size: Number of rows in the embedding table.
            embed_size: Length of each embedding vector (also the LSTM input size).
            hidden_size: LSTM hidden size (also the linear layer's input size).
            output_size: Number of outputs of the final linear layer.
            num_layers: Number of LSTM layers.
        """
        super().__init__() 
        self.hidden_size= hidden_size
        self.num_layers= num_layers 
        self.embed= torch.nn.Embedding(num_embeddings=input_size,embedding_dim=embed_size) # lookup table: input_size rows, each an embed_size-long vector
        # An index tensor of shape S is mapped to a float tensor of shape S + (embed_size,).
        self.rnn= torch.nn.LSTM(input_size=embed_size, hidden_size=hidden_size, num_layers=num_layers)  # consumes embeddings, keeps hidden_size-long states across num_layers layers
        # NOTE(review): batch_first is not set, so the LSTM presumably reads its input as (seq_len, batch, embed_size) -- confirm.
        self.fc= torch.nn.Linear(hidden_size, output_size)  # maps an LSTM output vector to output_size values
    
    # Forward pass: takes the input indices plus the initial hidden/cell state.
    def forward(self, features, hidden_cell_state): 
        """Run the visible part of the forward pass (embedding followed by the LSTM).

        Args:
            features: Tensor of indices; it is reshaped to a single row below.
            hidden_cell_state: Initial state handed unchanged to the LSTM;
                presumably a ``(hidden, cell)`` tuple -- TODO confirm.
        """
        # Flatten whatever was passed in into shape (1, N).
        features= features.view(1,-1) 
        
        embedded= self.embed(features)  # shape (1, N, embed_size)

        output, hidden_cell_state= self.rnn(embedded, hidden_cell_state)
        # NOTE(review): the rest of forward (presumably self.fc and a return statement) is missing here.


### LSTM Cell Model

In [10]:
class RNN_Cell(torch.nn.Module):
    """Character-level language model built around a single ``LSTMCell``.

    Each step maps a character index to an embedding, advances the LSTM cell
    by one time step, and projects the new hidden state to one score (logit)
    per output class.

    Args:
        input_size: Vocabulary size, i.e. number of rows in the embedding table.
        embed_size: Length of each embedding vector.
        hidden_size: Length of the LSTM hidden and cell state vectors.
        output_size: Number of output classes (the vocabulary size when
            predicting the next character).
    """

    def __init__(self, input_size, embed_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embed = torch.nn.Embedding(num_embeddings=input_size, embedding_dim=embed_size)
        self.rnn = torch.nn.LSTMCell(input_size=embed_size, hidden_size=hidden_size)
        self.fc = torch.nn.Linear(hidden_size, output_size)

    def forward(self, character, hidden, cell_state):
        """Advance the model by one character.

        Args:
            character: Index tensor of shape ``(batch_size,)``.
            hidden: Previous hidden state, shape ``(batch_size, hidden_size)``.
            cell_state: Previous cell state, shape ``(batch_size, hidden_size)``.

        Returns:
            A tuple ``(output, hidden, cell_state)`` where ``output`` holds the
            logits with shape ``(batch_size, output_size)`` and the two states
            are the updated LSTM states.
        """
        embedded = self.embed(character)  # (batch_size, embed_size)
        hidden, cell_state = self.rnn(embedded, (hidden, cell_state))
        # Only the hidden state feeds the classifier; the cell state is just
        # carried along to the next step.
        output = self.fc(hidden)
        return output, hidden, cell_state

    def init_zero_state(self, batch_size=1):
        """Create all-zero initial hidden and cell states.

        The states are allocated with the same dtype and on the same device as
        the model's own weights, so they stay compatible even after the model
        has been moved or cast.

        Args:
            batch_size: Number of sequences processed in parallel (default 1).

        Returns:
            A tuple ``(hidden, cell)`` of zero tensors, each of shape
            ``(batch_size, hidden_size)``.
        """
        reference = self.fc.weight
        hidden = torch.zeros(
            batch_size, self.hidden_size, dtype=reference.dtype, device=reference.device
        )
        cell = torch.zeros_like(hidden)
        return (hidden, cell)

In [11]:
# Build the LSTM-cell model; both its input vocabulary and its output classes
# are the characters of string.printable.
torch.manual_seed(RANDOM_SEED)  # re-seed right before the weights are initialised
model = RNN_Cell(
    input_size=len(string.printable),
    embed_size=EMBEDDING_DIM,
    hidden_size=HIDDEN_DIM,
    output_size=len(string.printable),
).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
def evaluate(model, prime_str='A', predict_len=100, temperature=0.8):
    """Generate text by sampling from ``model`` one character at a time.

    Based on https://github.com/spro/practical-pytorch/
    blob/master/char-rnn-generation/char-rnn-generation.ipynb

    Args:
        model: Model exposing ``init_zero_state()`` and a call signature
            ``model(char_index, hidden, cell_state)`` that returns
            ``(logits, hidden, cell_state)``.
        prime_str: Non-empty seed text used to warm up the recurrent state.
        predict_len: Number of characters to generate after ``prime_str``.
        temperature: Positive scaling factor for the logits; lower values make
            the sampling more conservative, higher values more random.

    Returns:
        ``prime_str`` followed by ``predict_len`` sampled characters.

    Raises:
        ValueError: If ``prime_str`` is empty or ``temperature`` is not positive.
    """
    if not prime_str:
        raise ValueError("prime_str must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    prime_input = char_to_tensor(prime_str).to(DEVICE)
    predicted = prime_str

    # Sampling needs no gradients; skipping autograd bookkeeping saves memory
    # and time.
    with torch.no_grad():
        hidden, cell_state = model.init_zero_state()

        # Feed every priming character except the last to build up the state.
        for char_index in prime_input[:-1]:
            _, hidden, cell_state = model(char_index.unsqueeze(0), hidden, cell_state)
        inp = prime_input[-1].unsqueeze(0)

        for _ in range(predict_len):
            logits, hidden, cell_state = model(inp, hidden, cell_state)

            # softmax(logits / T) is the normalised form of e^{logits / T};
            # unlike a bare exp() it cannot overflow to inf, which
            # torch.multinomial would reject.
            probabilities = torch.softmax(logits.view(-1) / temperature, dim=0)

            # The sampled index (shape (1,)) is both the next predicted
            # character and the next input to the model.
            inp = torch.multinomial(probabilities, 1)
            predicted += string.printable[inp.item()]

    return predicted

In [None]:
# Train on one randomly sampled text portion per iteration (batch size 1).
start = time.time()
for iteration in range(NUM_ITER):
    hidden, cell_state = model.init_zero_state()  # fresh recurrent state for every portion
    optimizer.zero_grad()  # clear gradients left over from the previous iteration
    inputs, targets = draw_random_sample(textfile)
    inputs, targets = inputs.to(DEVICE), targets.to(DEVICE)

    # Step over the characters that were actually sampled instead of assuming
    # exactly TEXT_PORTION_SIZE of them, so a shorter sample cannot trigger an
    # IndexError.
    num_steps = len(inputs)
    loss = 0
    for step in range(num_steps):
        # Feed one character at a time, carrying the state forward, and
        # compare the prediction with the character that really follows.
        outputs, hidden, cell_state = model(inputs[step].unsqueeze(0), hidden, cell_state)
        loss += torch.nn.functional.cross_entropy(outputs, targets[step].view(1))
    loss /= num_steps  # mean loss per character
    loss.backward()  # backpropagate through all time steps of this portion

    optimizer.step()

    if iteration % 200 == 0:
        print(f"Time elapsed: {(time.time() - start)/60:.2f} min")
        print(f"Iteration: {iteration}  | Loss: {loss.item():.3f}")
