In [2]:
import torch
import numpy as np
from torch import nn
import torch.nn.functional as F

### Load data

In [3]:
# Read the whole corpus into memory as one string.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale.
with open('data/anna.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [4]:
text[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

### tokenization

In [5]:
# Build the vocabulary and the two lookup tables:
#   int2char : integer id -> character
#   char2int : character  -> integer id
# NOTE: `set` iteration order is arbitrary, so the ids assigned here can
# differ from one kernel session to the next.
chars = tuple(set(text))
int2char = {idx: ch for idx, ch in enumerate(chars)}
char2int = {ch: idx for idx, ch in enumerate(chars)}

In [9]:
char2int

{'$': 0,
 '0': 1,
 'R': 2,
 '`': 3,
 'L': 4,
 'C': 5,
 'B': 6,
 'E': 7,
 '&': 8,
 'p': 9,
 ' ': 10,
 'f': 11,
 '"': 12,
 ',': 13,
 'Y': 14,
 '\n': 15,
 'S': 16,
 '1': 17,
 'k': 18,
 ';': 19,
 'n': 20,
 '5': 21,
 '6': 22,
 'y': 23,
 '8': 24,
 'w': 25,
 'J': 26,
 'T': 27,
 'q': 28,
 'W': 29,
 ')': 30,
 '4': 31,
 '(': 32,
 'D': 33,
 'o': 34,
 'j': 35,
 'm': 36,
 '.': 37,
 'F': 38,
 'h': 39,
 'c': 40,
 "'": 41,
 'd': 42,
 's': 43,
 '9': 44,
 'X': 45,
 'Q': 46,
 '%': 47,
 'x': 48,
 '*': 49,
 '?': 50,
 'G': 51,
 'V': 52,
 'v': 53,
 'N': 54,
 ':': 55,
 't': 56,
 'H': 57,
 'M': 58,
 'l': 59,
 'U': 60,
 '7': 61,
 'a': 62,
 'i': 63,
 'b': 64,
 '/': 65,
 'g': 66,
 '3': 67,
 'r': 68,
 'K': 69,
 'A': 70,
 'u': 71,
 '2': 72,
 '@': 73,
 'Z': 74,
 'e': 75,
 'O': 76,
 '-': 77,
 'I': 78,
 'z': 79,
 'P': 80,
 '_': 81,
 '!': 82}

In [10]:
# Encode the text: replace every character with its integer id
encoded = np.array(list(map(char2int.__getitem__, text)))

In [11]:
encoded

array([ 5, 39, 62, ..., 43, 37, 15])

In [12]:
encoded[:100]

array([ 5, 39, 62,  9, 56, 75, 68, 10, 17, 15, 15, 15, 57, 62,  9,  9, 23,
       10, 11, 62, 36, 63, 59, 63, 75, 43, 10, 62, 68, 75, 10, 62, 59, 59,
       10, 62, 59, 63, 18, 75, 19, 10, 75, 53, 75, 68, 23, 10, 71, 20, 39,
       62,  9,  9, 23, 10, 11, 62, 36, 63, 59, 23, 10, 63, 43, 10, 71, 20,
       39, 62,  9,  9, 23, 10, 63, 20, 10, 63, 56, 43, 10, 34, 25, 20, 15,
       25, 62, 23, 37, 15, 15,  7, 53, 75, 68, 23, 56, 39, 63, 20])

### preprocessing

In [13]:
def one_hot_encode(arr, n_labels):
    """One-hot encode an array of integer labels.

    Arguments
    ---------
    arr: integer array of any shape; every value must be in [0, n_labels)
    n_labels: length of each one-hot vector

    Returns
    -------
    float32 array of shape (*arr.shape, n_labels) with a single 1. per
    label position.
    """
    arr = np.asarray(arr)

    # One row per label. `arr.size` works for any number of dimensions,
    # whereas multiplying the unpacked shape only works for 2-D input.
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)
    one_hot[np.arange(arr.size), arr.ravel()] = 1.

    # Restore the original layout with the one-hot axis appended last
    return one_hot.reshape((*arr.shape, n_labels))

In [14]:
# Sanity check: one-hot encode a 1 x 3 batch of ids using 8 labels
test_seq = np.array([[3,5,1]])
one_hot = one_hot_encode(test_seq,8)
print(one_hot)

[[[0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]


### making training mini-batches

#### Creating Batches
- 각 배치는 N x M 문자로 구성됩니다. N is the batch size, and M is the seq_length (the number of time steps in a sequence).
- 전체 배치 수를 K라 하면, arr는 N x M x K 크기가 됩니다 (K is the total number of batches).

#### split arr into N batches

In [18]:
def get_batches(arr, batch_size, seq_length):
    """Generate (x, y) mini-batches of shape (batch_size, seq_length).

    Arguments
    ---------
    arr: 1-D array to cut into batches
    batch_size: number of sequences (rows) per batch
    seq_length: number of encoded characters per sequence

    Yields
    ------
    x: slice of the input with shape (batch_size, seq_length)
    y: the targets, i.e. x shifted one step ahead
    """
    chars_per_batch = batch_size * seq_length
    full_batches = len(arr) // chars_per_batch

    # Keep only as many characters as fill complete batches, then lay
    # them out as `batch_size` rows
    rows = arr[:full_batches * chars_per_batch].reshape((batch_size, -1))
    row_len = rows.shape[1]

    for start in range(0, row_len, seq_length):
        stop = start + seq_length
        x = rows[:, start:stop]

        # Targets have the same shape as the inputs, shifted left by one
        y = np.zeros_like(x)
        y[:, :-1] = x[:, 1:]

        # The last target is the character that follows the window; for
        # the final window it wraps around to the first column
        next_col = stop if stop < row_len else 0
        y[:, -1] = rows[:, next_col]

        yield x, y

### test

In [19]:
# Pull the first (x, y) pair from the generator: batch_size=8, seq_length=50
batches = get_batches(encoded,8,50)
x,y = next(batches)

In [20]:
# Show the first 10 columns of each; y should be x shifted one step ahead
print('x\n',x[:10,:10])
print('\ny\n', y[:10,:10])

x
 [[ 5 39 62  9 56 75 68 10 17 15]
 [43 34 20 10 56 39 62 56 10 62]
 [75 20 42 10 34 68 10 62 10 11]
 [43 10 56 39 75 10 40 39 63 75]
 [10 43 62 25 10 39 75 68 10 56]
 [40 71 43 43 63 34 20 10 62 20]
 [10 70 20 20 62 10 39 62 42 10]
 [76 64 59 34 20 43 18 23 37 10]]

y
 [[39 62  9 56 75 68 10 17 15 15]
 [34 20 10 56 39 62 56 10 62 56]
 [20 42 10 34 68 10 62 10 11 34]
 [10 56 39 75 10 40 39 63 75 11]
 [43 62 25 10 39 75 68 10 56 75]
 [71 43 43 63 34 20 10 62 20 42]
 [70 20 20 62 10 39 62 42 10 43]
 [64 59 34 20 43 18 23 37 10 12]]


In [23]:
y[:,-1]

array([39, 12, 20, 10, 64, 75, 43, 75])

In [31]:
encoded[:50]

array([ 5, 39, 62,  9, 56, 75, 68, 10, 17, 15, 15, 15, 57, 62,  9,  9, 23,
       10, 11, 62, 36, 63, 59, 63, 75, 43, 10, 62, 68, 75, 10, 62, 59, 59,
       10, 62, 59, 63, 18, 75, 19, 10, 75, 53, 75, 68, 23, 10, 71, 20])

In [32]:
# Detect CUDA once; later cells read this flag to decide where tensors go
train_on_gpu = torch.cuda.is_available()
print('training on gpu' if train_on_gpu else 'No Gpu available, training on Cpu')

No Gpu available, training on Cpu


In [51]:
class CharRNN(nn.Module):
    ''' Character-level LSTM language model.

        One-hot encoded characters go through a stacked LSTM, a dropout
        layer and a fully-connected layer that produces one score per
        character in the vocabulary.

        Arguments
        ---------

        tokens: sequence of the unique characters (the vocabulary)
        n_hidden: number of hidden units in each LSTM layer
        n_layers: number of stacked LSTM layers
        drop_prob: dropout probability (between LSTM layers and before the
                   fully-connected layer)
        lr: learning rate; only stored on the instance here
    '''

    def __init__(self, tokens, n_hidden=256, n_layers=2,
                               drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        # creating character dictionaries
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}

        # LSTM over one-hot inputs. Inter-layer dropout only exists when
        # there is more than one layer; passing a non-zero value with a
        # single layer just triggers a warning, so it is disabled there.
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers,
                            dropout=drop_prob if n_layers > 1 else 0.0,
                            batch_first=True)

        # dropout applied to the LSTM outputs
        self.dropout = nn.Dropout(drop_prob)

        # final, fully-connected output layer: one score per character
        self.fc = nn.Linear(n_hidden, len(self.chars))


    def forward(self, x, hidden):
        ''' Forward pass through the network.

            Arguments
            ---------

            x: one-hot input of shape (batch, seq, len(self.chars))
            hidden: tuple (h, c) of LSTM hidden and cell state

            Returns
            -------

            out: scores of shape (batch * seq, len(self.chars))
            hidden: the updated (h, c) tuple
        '''

        # outputs for every time step plus the new hidden state
        r_output, hidden = self.lstm(x, hidden)

        out = self.dropout(r_output)

        # Stack up the LSTM outputs so every time step is one row;
        # contiguous() is needed before view() can reshape the tensor
        out = out.contiguous().view(-1, self.n_hidden)

        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden


    def init_hidden(self, batch_size):
        ''' Initializes hidden state.

            Returns a tuple of two zero tensors (hidden state, cell state),
            each of size n_layers x batch_size x n_hidden. They are created
            with the dtype and device of the model parameters, so they
            always match the model no matter where it currently lives.
        '''
        weight = next(self.parameters())
        shape = (self.n_layers, batch_size, self.n_hidden)

        return (weight.new_zeros(shape), weight.new_zeros(shape))

In [52]:
def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, val_frac=0.1, print_every=10):
    ''' Training a network 
    
        Arguments
        ---------
        
        net: CharRNN network
        data: encoded text data (array of integer ids) to train the network
        epochs: Number of epochs to train
        batch_size: Number of mini-sequences per mini-batch, aka batch size
        seq_length: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss
    
    '''
    # Move the model first so the optimizer is built on the parameters
    # that are actually trained.
    if(train_on_gpu):
        net.cuda()

    net.train()

    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # create training and validation data (the last `val_frac` is held out)
    val_idx = int(len(data)*(1-val_frac))
    data, val_data = data[:val_idx], data[val_idx:]

    counter = 0
    n_chars = len(net.chars)
    for e in range(epochs):
        # initialize hidden state
        h = net.init_hidden(batch_size)

        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1

            # One-hot encode our data and make them Torch tensors
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

            if(train_on_gpu):
                inputs, targets = inputs.cuda(), targets.cuda()

            # Detach the hidden state from the previous step, otherwise
            # we'd backprop through the entire training history
            h = tuple(each.detach() for each in h)

            # zero accumulated gradients
            net.zero_grad()

            # get the output from the model
            output, h = net(inputs, h)

            # calculate the loss and perform backprop; the targets are
            # flattened to line up with the (batch*seq, n_chars) output
            loss = criterion(output, targets.reshape(-1).long())
            loss.backward()
            # `clip_grad_norm_` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()

            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()

                # No gradients are needed for validation; no_grad avoids
                # building (and holding on to) the autograd graph.
                with torch.no_grad():
                    for val_x, val_y in get_batches(val_data, batch_size, seq_length):
                        # One-hot encode our data and make them Torch tensors
                        val_x = one_hot_encode(val_x, n_chars)
                        val_inputs = torch.from_numpy(val_x)
                        val_targets = torch.from_numpy(val_y)

                        if(train_on_gpu):
                            val_inputs, val_targets = val_inputs.cuda(), val_targets.cuda()

                        val_output, val_h = net(val_inputs, val_h)
                        val_loss = criterion(val_output, val_targets.reshape(-1).long())

                        val_losses.append(val_loss.item())

                net.train() # reset to train mode after iterating through validation data

                # The held-out split can be too small to form one batch;
                # report nan instead of averaging an empty list.
                mean_val_loss = np.mean(val_losses) if val_losses else float('nan')

                print("Epoch: {}/{}...".format(e+1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(mean_val_loss))

In [53]:
# Model size: hidden units per LSTM layer and number of stacked layers
n_hidden = 512
n_layers = 2

# Build the network over the full character vocabulary and show its layers
net = CharRNN(chars, n_hidden=n_hidden, n_layers=n_layers)
print(net)

CharRNN(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)


In [55]:
# Training hyperparameters
batch_size = 128
seq_length = 100
n_epochs = 20

# Train on the encoded text
train(
    net,
    encoded,
    epochs=n_epochs,
    batch_size=batch_size,
    seq_length=seq_length,
    lr=0.001,
)

Epoch: 1/20... Step: 10... Loss: 3.1161... Val Loss: 3.1018
Epoch: 1/20... Step: 20... Loss: 3.0473... Val Loss: 3.0642
Epoch: 1/20... Step: 30... Loss: 2.9971... Val Loss: 3.0010
Epoch: 1/20... Step: 40... Loss: 2.8645... Val Loss: 2.8706
Epoch: 1/20... Step: 50... Loss: 2.8470... Val Loss: 2.8109
Epoch: 1/20... Step: 60... Loss: 2.6999... Val Loss: 2.6792
Epoch: 1/20... Step: 70... Loss: 2.6978... Val Loss: 2.5908
Epoch: 1/20... Step: 80... Loss: 2.5557... Val Loss: 2.5465
Epoch: 1/20... Step: 90... Loss: 2.5106... Val Loss: 2.4760
Epoch: 1/20... Step: 100... Loss: 2.4526... Val Loss: 2.4291
Epoch: 1/20... Step: 110... Loss: 2.4050... Val Loss: 2.3943
Epoch: 1/20... Step: 120... Loss: 2.3540... Val Loss: 2.3606
Epoch: 1/20... Step: 130... Loss: 2.3683... Val Loss: 2.3308
Epoch: 2/20... Step: 140... Loss: 2.3404... Val Loss: 2.2991
Epoch: 2/20... Step: 150... Loss: 2.2959... Val Loss: 2.2664
Epoch: 2/20... Step: 160... Loss: 2.2679... Val Loss: 2.2348
Epoch: 2/20... Step: 170... Loss:

KeyboardInterrupt: 

In [None]:
# Save everything needed to rebuild the network later: the constructor
# arguments, the learned weights and the vocabulary.
model_name = 'rnn_20_epoch.net'

checkpoint = dict(
    n_hidden=net.n_hidden,
    n_layers=net.n_layers,
    state_dict=net.state_dict(),
    tokens=net.chars,
)

with open(model_name, 'wb') as f:
    torch.save(checkpoint, f)

In [None]:
def predict(net, char, h=None, top_k=None):
    ''' Given a character, predict the next character.

        Arguments
        ---------

        net: trained network exposing `chars`, `char2int`, `int2char`
             and `init_hidden`
        char: the current (single) character
        h: hidden state tuple from the previous step; a fresh zero state
           is created when None
        top_k: if given, sample only among the `top_k` most likely
               characters; otherwise sample over the whole vocabulary

        Returns
        -------

        The sampled next character and the updated hidden state.
    '''
    # One-hot encode the single input character as a 1 x 1 x n_chars tensor
    x = np.array([[net.char2int[char]]])
    x = one_hot_encode(x, len(net.chars))
    inputs = torch.from_numpy(x)

    # Run on whatever device the model currently lives on
    device = next(net.parameters()).device
    inputs = inputs.to(device)

    if h is None:
        h = net.init_hidden(1)

    # Detach the hidden state from its history
    h = tuple(each.detach().to(device) for each in h)

    # Inference only: no autograd graph needed
    with torch.no_grad():
        out, h = net(inputs, h)

    # Character probabilities, moved to the CPU for numpy
    p = F.softmax(out, dim=1).cpu()

    # get top characters
    if top_k is None:
        top_ch = np.arange(len(net.chars))
    else:
        p, top_ch = p.topk(top_k)
        top_ch = top_ch.numpy().reshape(-1)

    # reshape(-1) keeps a 1-D array even when top_k == 1 (squeeze would
    # collapse it to 0-d); float64 keeps the renormalized probabilities
    # well within np.random.choice's sum-to-one tolerance.
    p = p.numpy().reshape(-1).astype(np.float64)
    char = np.random.choice(top_ch, p=p/p.sum())

    return net.int2char[int(char)], h

In [None]:
def sample(net, size, prime='The', top_k=None):
    ''' Generate text with the network.

        Arguments
        ---------

        net: trained network
        size: number of characters to generate after the priming step
        prime: non-empty string used to warm up the hidden state; it is
               also the start of the returned text
        top_k: passed on to `predict`

        Returns
        -------

        A string of len(prime) + 1 + size characters: the prime, the
        character predicted right after it, then `size` more.

        Raises
        ------

        ValueError: if `prime` is empty (there is nothing to predict from)
    '''
    if not prime:
        raise ValueError("prime must contain at least one character")

    if(train_on_gpu):
        net.cuda()
    else:
        net.cpu()

    net.eval() # eval mode

    # First off, run through the prime characters to build up the hidden
    # state; only the prediction after the last prime character is kept
    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        char, h = predict(net, ch, h, top_k=top_k)

    chars.append(char)

    # Now pass in the previous character and get a new one
    for ii in range(size):
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)

In [None]:
# Generate text from `net`, primed with 'Anna' and passing top_k=5 to the sampler
print(sample(net, 1000, prime='Anna', top_k=5))

### loading a checkpoint

In [None]:
# NOTE: torch.load unpickles the file, so only open checkpoints that come
# from a trusted source.
# map_location='cpu' lets a checkpoint written on a GPU machine load on a
# machine without CUDA.
with open('rnn_20_epoch.net', 'rb') as f:
    checkpoint = torch.load(f, map_location='cpu')

# Rebuild the network with the saved constructor arguments, then restore
# the learned weights
loaded = CharRNN(checkpoint['tokens'], n_hidden=checkpoint['n_hidden'], n_layers=checkpoint['n_layers'])
loaded.load_state_dict(checkpoint['state_dict'])

In [None]:
# Sample using the model restored from the checkpoint (primed with
# "And Levin said", top_k=5)
print(sample(loaded, 2000, top_k=5, prime="And Levin said"))