In [27]:
import numpy as np
import pandas as pd
import torch
from torch import nn, optim

# Directory that holds the input corpus. It must end with a path separator
# because file names are appended to it by plain string concatenation.
datapath = "../data/"

In [262]:
# Load the whole training corpus into memory as one string.
with open(datapath + 'anna.txt', 'r') as f:
    text = f.read()

#print(text[:10])

# Build the vocabulary. The characters are sorted so that the char <-> int
# mapping is deterministic: iterating a set of strings directly yields an
# order that can change between interpreter runs (string hash randomisation),
# which would silently change every encoded index after a kernel restart.
chars = tuple(sorted(set(text)))
int2char = dict(enumerate(chars)) # map integers to unique characters
char2int = {ch: ii for ii, ch in int2char.items()} # map characters into unique integers

#np.array([ char2int[ch] for ch in text[:100] ])

def int_to_char(arr, int2char):
    """Decode an array of integer codes into an equally shaped array of characters.

    arr: NumPy array of integer codes (any shape).
    int2char: mapping from integer code to character.
    """
    decoded = [int2char[code] for code in arr.ravel()]
    return np.array(decoded).reshape(*arr.shape)
def char_to_int(arr, char2int):
    """Encode an array of characters into an equally shaped array of integer codes.

    arr: NumPy array of characters (any shape).
    char2int: mapping from character to integer code.
    """
    encoded = [char2int[symbol] for symbol in arr.ravel()]
    return np.array(encoded).reshape(*arr.shape)

def one_hot_encode(arr, n_labels):
    """One-hot encode an integer array.

    arr: NumPy array of integer labels (any shape).
    n_labels: width of the one-hot axis.

    Returns a float32 array of shape (*arr.shape, n_labels) holding a single 1
    per label at the position given by that label.
    """
    flat_labels = arr.ravel()

    # Work on a 2-D (n_items, n_labels) table first: one row per label.
    encoded = np.zeros((flat_labels.size, n_labels), dtype=np.float32)
    encoded[np.arange(flat_labels.size), flat_labels] = 1

    # Restore the input layout, with the one-hot axis appended last.
    return encoded.reshape(arr.shape + (n_labels,))

def get_batches(arr, batch_size, seq_length, encode=None, n_labels=None):
    '''Create a generator that returns batches of size
       batch_size x seq_length from arr.

       Arguments
       ---------
       arr: 1-D array of non-negative integer codes to make batches from
       batch_size: Batch size, the number of sequences per batch
       seq_length: Number of encoded chars in a sequence
       encode: None to yield integer codes, 'one-hot' to yield one-hot arrays
       n_labels: Width of the one-hot axis (only used when encode == 'one-hot').
                 Defaults to max(arr) + 1 so that every code present in arr
                 can be encoded, even when some smaller codes do not occur.

       Yields
       ------
       (x, y) where y is x shifted one step to the right. Trailing elements
       that do not fill a complete batch are dropped.

       Raises
       ------
       ValueError: if batch_size or seq_length is not positive
                   (raised when the generator is first advanced).
    '''
    if batch_size <= 0 or seq_length <= 0:
        raise ValueError("batch_size and seq_length must be positive")

    arr = np.asarray(arr)

    ## Get the number of batches we can make (plain integer floor division,
    ## so no float rounding and no fixed-width integer overflow)
    chars_per_batch = batch_size * seq_length
    n_batches = arr.size // chars_per_batch

    ## Keep only enough characters to make full batches,
    ## then reshape into batch_size rows
    arr = arr[:n_batches * chars_per_batch].reshape((batch_size, -1))
    row_length = arr.shape[1]

    ## Only work out the one-hot width when it is actually needed. Counting
    ## distinct values is not enough: if a code is missing from arr the
    ## largest code no longer fits into the encoding.
    if encode == 'one-hot' and n_labels is None:
        n_labels = int(arr.max()) + 1 if arr.size else 0

    ## Iterate over the batches using a window of size seq_length
    for n in range(0, row_length, seq_length):
        # The features
        x = arr[:, n:n+seq_length]
        # The targets, shifted by one
        if n + seq_length < row_length:
            y = arr[:, n+1:n+seq_length+1]
        else:
            # Last window of a row: there is no "next" element inside the row,
            # so the final target wraps around to the row's first element.
            y = np.concatenate((arr[:, n+1:n+seq_length], arr[:, :1]), axis=1)
        if encode == 'one-hot':
            x = one_hot_encode(x, n_labels)
            y = one_hot_encode(y, n_labels)
        yield x, y

## test implementations
# Encode the whole corpus as integer codes; later cells train on text_arr.
text_arr = np.array([ char2int[ch] for ch in text ])
print(text_arr.shape, len(text))

#xy_onehot = get_batches(text_arr, 8, 50, encode='one-hot')
#print(next(iter(xy_onehot))[0].shape)

# Decode every (input, target) sequence back to text as a sanity check of
# get_batches. The names deliberately avoid `input` and `id`, which would
# otherwise rebind those builtins for every later cell in the kernel.
for batch_inputs, batch_targets in get_batches(text_arr, 128, 100):
    # next, iterate over the sequences of the batch
    for seq_inputs, seq_targets in zip(batch_inputs, batch_targets):
        input_text = ''.join(int2char[code] for code in seq_inputs)
        target_text = ''.join(int2char[code] for code in seq_targets)
        #print("input x: " + input_text)
        #print("target y: " + target_text)


(1985223,) 1985223
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
6

In [21]:
## Now define the neural network:
# check if GPU is available
# NOTE: the returned count is not assigned to anything, and because this is
# not the last expression of the cell it is not displayed either.
torch.cuda.device_count()

class CharRNN(nn.Module):
    ''' Character-level language model: stacked LSTM -> dropout -> linear layer.

        Arguments
        ---------
        tokens: sequence of the distinct characters; its length sets both the
                input width (one-hot) and the number of output scores
        n_hidden: number of hidden units per LSTM layer
        n_layers: number of stacked LSTM layers
        drop_prob: dropout probability, used between LSTM layers and on the
                   LSTM output
        lr: learning rate; only stored on the instance as `learning_rate`
    '''

    def __init__(self, tokens, n_hidden=256, n_layers=2,
                               drop_prob=0.5, lr=0.001):
        super().__init__()
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}

        self.n_hidden = n_hidden
        self.n_layers = n_layers
        self.drop_prob = drop_prob
        self.learning_rate = lr

        ## define the layers of the model
        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(p=drop_prob)
        self.fc = nn.Linear(n_hidden, len(self.chars))

    def forward(self, x, hidden):
        ''' Forward pass through the network.
            These inputs are x, and the hidden/cell state `hidden`.

            x is fed to a batch_first LSTM, i.e. it is laid out as
            (batch, sequence, len(self.chars)).

            Returns the scores flattened to (batch * sequence, len(self.chars))
            together with the new hidden/cell state. '''

        ## Get the outputs and the new hidden state from the lstm
        out, hidden = self.lstm(x, hidden)
        out = self.dropout(out)
        # Stack all time steps of all sequences so the linear layer scores
        # every position at once.
        out = out.contiguous().view(-1, self.n_hidden)
        out = self.fc(out)

        # return the final output and the hidden state
        return out, hidden

    def init_hidden(self, batch_size):
        ''' Initializes hidden state

            Returns a (hidden, cell) tuple of zero tensors, each of shape
            (n_layers, batch_size, n_hidden). The tensors are created with the
            device and dtype of the model's own parameters, so they match the
            weights no matter where the model currently lives (e.g. a CPU
            model on a machine that also has a GPU). '''
        weight = next(self.parameters())
        state_shape = (self.n_layers, batch_size, self.n_hidden)
        return (weight.new_zeros(state_shape),
                weight.new_zeros(state_shape))

In [294]:
## Now start training the network
def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001, clip=5, val_frac=0.1, print_every=10):
    ''' Training a network

        Arguments
        ---------

        net: CharRNN network
        data: integer-encoded text (1-D NumPy array) to train the network
        epochs: Number of epochs to train
        batch_size: Number of mini-sequences per mini-batch, aka batch size
        seq_length: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss

        The last val_frac of `data` is held out for validation. Progress is
        printed; nothing is returned.
    '''

    net.train()

    # Move the model first so the optimizer is constructed on the parameters
    # in their final location.
    train_on_gpu = torch.cuda.device_count() > 0
    if train_on_gpu:
        net.cuda()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=lr)

    # create training and validation data
    val_idx = int(len(data)*(1-val_frac))
    train_data, val_data = data[:val_idx], data[val_idx:]

    n_chars = len(net.chars)
    counter = 0
    for e in range(epochs):
        # initialize hidden state
        h = net.init_hidden(batch_size)

        for x, y in get_batches(train_data, batch_size, seq_length):
            counter += 1
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

            if train_on_gpu:
                inputs, targets = inputs.cuda(), targets.cuda()

            # creating new variables for the hidden state, OW backprop thru the entire training history
            h = tuple(each.detach() for each in h)

            optimizer.zero_grad()

            output, h = net(inputs, h)

            # get_batches yields slices of a larger array, so on the CPU the
            # target tensor is not contiguous and .view() would raise;
            # .reshape() copies only when it has to.
            loss = criterion(output, targets.reshape(-1).long())
            loss.backward()

            # apply clip_grad_norm, to prevent gradient explosion
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            optimizer.step()

            # monitor loss
            if counter % print_every == 0:
                # validation loss
                h_val = net.init_hidden(batch_size)
                losses_val = []
                net.eval()
                # No gradients are needed for evaluation; skipping the graph
                # keeps memory use down.
                with torch.no_grad():
                    for x_val, y_val in get_batches(val_data, batch_size, seq_length):
                        x_val = one_hot_encode(x_val, n_chars)
                        inputs_val, targets_val = torch.from_numpy(x_val), torch.from_numpy(y_val)

                        if train_on_gpu:
                            inputs_val, targets_val = inputs_val.cuda(), targets_val.cuda()

                        output_val, h_val = net(inputs_val, h_val)
                        loss_val = criterion(output_val, targets_val.reshape(-1).long())
                        losses_val.append(loss_val.item())

                net.train()

                # A validation split too small for a single batch yields no
                # losses; report nan instead of averaging an empty list.
                mean_loss_val = np.mean(losses_val) if losses_val else float('nan')

                print("Epoch: {}/{}...".format(e+1,epochs),
                     "Step: {}...".format(counter),
                     "Loss: {:.4f}...".format(loss.item()),
                     "Val Loss: {:.4f}".format(mean_loss_val))

# Hyperparameters for the model and the training run.
n_hidden, n_layers = 512, 2
batch_size, seq_length = 128, 100
n_epochs = 20

net = CharRNN(tokens=chars, n_hidden=n_hidden, n_layers=n_layers)
print(net)

# Now train the model
train(
    net,
    text_arr,
    epochs=n_epochs,
    batch_size=batch_size,
    seq_length=seq_length,
    lr=0.001,
    print_every=50,
)

CharRNN(
  (lstm): LSTM(83, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=512, out_features=83, bias=True)
)
Epoch: 1/20... Step: 50... Loss: 3.1432... Val Loss: 3.1177
Epoch: 1/20... Step: 100... Loss: 3.0802... Val Loss: 3.0683
Epoch: 2/20... Step: 150... Loss: 2.6425... Val Loss: 2.5976
Epoch: 2/20... Step: 200... Loss: 2.4445... Val Loss: 2.4050
Epoch: 2/20... Step: 250... Loss: 2.2718... Val Loss: 2.2699
Epoch: 3/20... Step: 300... Loss: 2.1848... Val Loss: 2.1545
Epoch: 3/20... Step: 350... Loss: 2.0998... Val Loss: 2.0578
Epoch: 3/20... Step: 400... Loss: 1.9906... Val Loss: 1.9787
Epoch: 4/20... Step: 450... Loss: 1.8900... Val Loss: 1.9084
Epoch: 4/20... Step: 500... Loss: 1.8977... Val Loss: 1.8532
Epoch: 4/20... Step: 550... Loss: 1.8563... Val Loss: 1.8030
Epoch: 5/20... Step: 600... Loss: 1.7678... Val Loss: 1.7605
Epoch: 5/20... Step: 650... Loss: 1.7231... Val Loss: 1.7181
Epoch: 6/20... Step: 700.

In [296]:
# save the model
# The checkpoint keeps the architecture sizes and the character set next to
# the weights, i.e. the values needed to call CharRNN(...) again.
model_name = 'rnn_20_epoch.net'

checkpoint = dict(
    n_hidden=net.n_hidden,
    n_layers=net.n_layers,
    state_dict=net.state_dict(),
    tokens=net.chars,
)

with open(model_name, 'wb') as f_mod:
    torch.save(checkpoint, f_mod)

In [292]:
# Build a fresh hidden/cell state pair and display the hidden part.
hd = net.init_hidden(batch_size)
hidden_part = hd[0]
hidden_part.data

tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]], device='cuda:0')

In [None]:
# Making predictions
