In [1]:
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
import numpy as np
import matplotlib.pyplot as plt

In [3]:
with open('../Data/shakespeare.txt','r',encoding='utf8') as f:
    text = f.read()

In [4]:
type(text)

str

In [5]:
print(text[:1000])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bud buriest thy content,
  And tender churl mak'st waste in niggarding:
    Pity the world, or else this glutton be,
    To eat the world's due, by the grave and thee.


                     2
  When forty winters shall besiege thy brow,
  And dig deep trenches in thy beauty's field,
  Thy youth's proud livery so gazed on now,
  Will be a tattered weed of small worth held:  
  Then being asked, where all thy beauty lies,
  Where all the treasure of thy lusty days;
  To say within thine own deep su

In [6]:
len(text)

5445609

In [7]:
all_characters = set(text)

In [8]:
len(all_characters)

84

In [9]:
# number of value ---> letter
decoder = dict(enumerate(all_characters))

In [10]:
#decoder

In [11]:
# letter ---> corresponding number
encoder = {char: idx for idx,char in decoder.items()}

In [12]:
#encoder

In [13]:
encoded_text = np.array([encoder[char] for char in text])

In [14]:
encoded_text[:500]

array([41, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
       26, 26, 26, 26, 26,  2, 41, 26, 26,  5, 74, 83, 47, 26, 28, 67, 38,
       74,  7, 78,  3, 26, 14, 74,  7, 67,  3, 75, 74,  7, 78, 26, 39,  7,
       26, 73,  7, 78, 38, 74,  7, 26, 38, 48, 14, 74,  7, 67, 78,  7, 30,
       41, 26, 26, 52, 17, 67,  3, 26,  3, 17,  7, 74,  7, 53, 11, 26, 53,
        7, 67, 75,  3, 11, 55, 78, 26, 74, 83, 78,  7, 26, 47, 38, 51, 17,
        3, 26, 48,  7, 27,  7, 74, 26, 73, 38,  7, 30, 41, 26, 26, 80, 75,
        3, 26, 67, 78, 26,  3, 17,  7, 26, 74, 38, 42,  7, 74, 26, 78, 17,
       83, 75, 77, 73, 26, 53, 11, 26,  3, 38, 47,  7, 26, 73,  7, 14,  7,
       67, 78,  7, 30, 41, 26, 26,  6, 38, 78, 26,  3,  7, 48, 73,  7, 74,
       26, 17,  7, 38, 74, 26, 47, 38, 51, 17,  3, 26, 53,  7, 67, 74, 26,
       17, 38, 78, 26, 47,  7, 47, 83, 74, 11, 33, 41, 26, 26, 80, 75,  3,
       26,  3, 17, 83, 75, 26, 14, 83, 48,  3, 74, 67, 14,  3,  7, 73, 26,
        3, 83, 26,  3, 17

In [15]:
decoder[43]

'q'

In [16]:
def one_hot_encoder(encoded_text, num_uni_chars):
    # encoded text ---> batch of encoded text
    # num_uni_chars ---> len(set(text))

    one_hot = np.zeros((encoded_text.size, num_uni_chars))

    # convert data type
    one_hot = one_hot.astype(np.float32)

    # set values to 1 for each corresponding character (i.e. 0, 0, 0, 0, ..., 1(43rd position), 0, 0, ...) for ' ' (space)
    one_hot[np.arange(one_hot.shape[0]), encoded_text.flatten()] = 1.0

    one_hot = one_hot.reshape((*encoded_text.shape, num_uni_chars))

    return one_hot

In [17]:
arr = np.array([1,2,0])

In [18]:
arr

array([1, 2, 0])

In [19]:
# for idx 1 of row 1 -> 1, idx 2 of row 2 -> 1, and idx 0 of row 2 -> 1
one_hot_encoder(arr, 3)

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

In [20]:
example_text = np.arange(10)

In [21]:
example_text

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [22]:
example_text.reshape((5,-1))

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])

In [23]:
def generate_batches(encoded_text, samp_per_batch=10, seq_len=50):
    # X: encoded text of length seq_len
    # ex: [0, 1, 2]
    #     [1, 2, 3]
    
    # Y: encoded text shifted by one
    # ex: [1, 2, 3]
    #     [2, 3, 4]
    
    char_per_batch = samp_per_batch * seq_len # total number of chars per batch
    num_batches_avail = int(len(encoded_text)/char_per_batch) # how many batches can we make

    # cut off the end of the encoded text, that won't fit evenly into a batch (remove last few chars, up to 49)
    encoded_text = encoded_text[:num_batches_avail*char_per_batch]

    encoded_text = encoded_text.reshape((samp_per_batch,-1))

    for n in range(0, encoded_text.shape[1], seq_len):
        x = encoded_text[:,n:n+seq_len]

        # zero array to the same shape as x
        y = np.zeros_like(x)

        try:
            y[:,:-1] = x[:,1:]
            y[:,-1] = encoded_text[:,n+seq_len]
        except:
            y[:,:-1] = x[:,1:]
            y[:,-1] = encoded_text[:,0]

        yield x,y

In [24]:
sample_text = np.arange(20)

In [25]:
sample_text

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [26]:
batch_generator = generate_batches(sample_text, samp_per_batch=2, seq_len=5)

In [27]:
x,y = next(batch_generator)

In [28]:
x

array([[ 0,  1,  2,  3,  4],
       [10, 11, 12, 13, 14]])

In [29]:
y

array([[ 1,  2,  3,  4,  5],
       [11, 12, 13, 14, 15]])

In [30]:
class CharModel(nn.Module):
    def __init__(self, all_chars, num_hidden=256, num_layers=4, drop_prob=0.5, use_gpu=False):
        super().__init__()
        self.drop_prob = drop_prob
        self.num_layers = num_layers
        self.num_hidden = num_hidden
        self.use_gpu = use_gpu

        self.all_chars = all_chars
        self.decoder = dict(enumerate(all_chars))
        self.encoder = {char:idx for idx, char in decoder.items()}

        # use batch_first=True to match the format as batch generator
        self.lstm = nn.LSTM(len(self.all_chars), num_hidden, num_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc_linear = nn.Linear(num_hidden, len(self.all_chars))

    def forward(self, x, hidden):
        lstm_output, hidden = self.lstm(x,hidden)
        drop_output = self.dropout(lstm_output)
        drop_output = drop_output.contiguous().view(-1,self.num_hidden) # reshaping
        final_out = self.fc_linear(drop_output)

        return final_out, hidden

    def hidden_state(self, batch_size):
        if self.use_gpu:
            hidden = (torch.zeros(self.num_layers, batch_size, self.num_hidden).cuda(),
                torch.zeros(self.num_layers, batch_size, self.num_hidden).cuda())
        else:
            hidden = (torch.zeros(self.num_layers, batch_size, self.num_hidden),
                torch.zeros(self.num_layers, batch_size, self.num_hidden))

        return hidden

In [31]:
model = CharModel(all_chars=all_characters, num_hidden=512, num_layers=3, drop_prob=0.5, use_gpu=True)

In [32]:
total_param = []
for p in model.parameters():
    total_param.append(int(p.numel()))

In [33]:
sum(total_param)
# try to match size of dataset (at least order of magnitude)
# too many parameters for too small text data -> overfitting
# too less parameters for too much text data -> underfitting

5470292

In [34]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [35]:
train_percent = 0.1

In [36]:
train_idx = int(len(encoded_text)*train_percent)

In [37]:
train_data = encoded_text[:train_idx]
val_data = encoded_text[train_idx:]

In [38]:
len(train_data)

544560

In [39]:
len(val_data)

4901049

In [40]:
train_percent = 0.9 # majority of data will be used for training

In [41]:
train_idx = int(len(encoded_text) * train_percent)

In [42]:
train_data = encoded_text[:train_idx]
val_data = encoded_text[train_idx:]

In [49]:
# variables:
epochs = 60
batch_size = 100
seq_len = 100

tracker = 0
num_char = max(encoded_text)+1 # indexing starting from 0

In [54]:
model.train()

if model.use_gpu:
    model.cuda()

for i in range(epochs):
    hidden = model.hidden_state(batch_size)

    for x, y in generate_batches(train_data,batch_size,seq_len):
        tracker += 1
        x = one_hot_encoder(x, num_char)

        inputs =torch.from_numpy(x)
        targets = torch.from_numpy(y)

        if model.use_gpu:
            inputs = inputs.cuda()
            targets = targets.cuda()

        hidden = tuple([state.data for state in hidden])

        model.zero_grad()

        lstm_output, hidden = model.forward(inputs, hidden)
        loss = criterion(lstm_output, targets.view(batch_size*seq_len).long()) # as long to avoid possible data type error
        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), max_norm=5) # to avoid gradient exploding
        optimizer.step()

        if tracker%25 == 0:
            val_hidden = model.hidden_state(batch_size)
            val_losses = []
            model.eval()

            for x, y in generate_batches(val_data, batch_size, seq_len):
                x = one_hot_encoder(x, num_char)

                inputs =torch.from_numpy(x)
                targets = torch.from_numpy(y)
        
                if model.use_gpu:
                    inputs = inputs.cuda()
                    targets = targets.cuda()

                val_hidden = tuple([state.data for state in val_hidden])

                lstm_output, val_hidden = model.forward(inputs, val_hidden)
                val_loss = criterion(lstm_output, targets.view(batch_size*seq_len).long())

                val_losses.append(val_loss.item())

            model.train()

            print(f'epoch: {i} step: {tracker} val loss: {val_loss.item()}')
        

epoch: 0 step: 75 val loss: 3.1928398609161377
epoch: 0 step: 100 val loss: 3.1836488246917725
epoch: 0 step: 125 val loss: 3.0538551807403564
epoch: 0 step: 150 val loss: 2.960284471511841
epoch: 0 step: 175 val loss: 2.842339515686035
epoch: 0 step: 200 val loss: 2.726668357849121
epoch: 0 step: 225 val loss: 2.6494829654693604
epoch: 0 step: 250 val loss: 2.5526108741760254
epoch: 0 step: 275 val loss: 2.429896593093872
epoch: 0 step: 300 val loss: 2.3213629722595215
epoch: 0 step: 325 val loss: 2.2449758052825928
epoch: 0 step: 350 val loss: 2.186616897583008
epoch: 0 step: 375 val loss: 2.1394622325897217
epoch: 0 step: 400 val loss: 2.0962395668029785
epoch: 0 step: 425 val loss: 2.0556678771972656
epoch: 0 step: 450 val loss: 2.026746988296509
epoch: 0 step: 475 val loss: 1.9988107681274414
epoch: 0 step: 500 val loss: 1.9744154214859009
epoch: 0 step: 525 val loss: 1.936339020729065
epoch: 1 step: 550 val loss: 1.9136974811553955
epoch: 1 step: 575 val loss: 1.890352487564087
e

In [55]:
model_name = 'hidden512_layers3_shakes.net'

In [56]:
torch.save(model.state_dict(), model_name)

In [61]:
def predict_next_char(model, char, hidden=None, k=1):
    encoded_text = model.encoder[char]
    encoded_text = np.array([[encoded_text]])

    encoded_text = one_hot_encoder(encoded_text, len(model.all_chars))

    inputs = torch.from_numpy(encoded_text)

    if model.use_gpu:
        inputs = inputs.cuda()

    hidden = tuple([state.data for state in hidden])

    lstm_out, hidden = model(inputs, hidden)

    probs = F.softmax(lstm_out,dim=1).data

    if model.use_gpu:
        probs = probs.cpu()

    probs, idx_positions = probs.topk(k) # k determines how many chars to be considered in this probability
    idx_positions = idx_positions.numpy().squeeze() # squeeze to put in a correct shape

    probs = probs.numpy().flatten()
    probs = probs/probs.sum()

    char = np.random.choice(idx_positions, p=probs) # next char

    return model.decoder[char], hidden

In [63]:
def generate_text(model, size, seed='The', k=1): # different k may print less probable chars (higher chance of typo)
    if model.use_gpu:
        model.cuda()
    else:
        model.cpu()

    model.eval()

    output_chars = [c for c in seed]
    
    hidden = model.hidden_state(1)

    for char in seed:
        char, hidden = predict_next_char(model, char, hidden, k=k)

    output_chars.append(char)

    for i in range(size):
        char, hidden = predict_next_char(model, output_chars[-1], hidden, k=k) # now generating prediction of prediction
        output_chars.append(char)

    return ''.join(output_chars)

In [64]:
print(generate_text(model, 1000, seed='The ', k=3))

The word of the sun too street of a seal of a strong
    dead soul that they will be a sensuble truth in this way.
  Ham. Why, what's your honour too?
  Bene. I will be a stream of all the stars, and I will never say I am
    not a state of man and my son in my heart.
  Bene. What say you to me, sir? Why then, I have not a third that you have, and
    the matter we have any office of the contrary.
  Ham. I am a soldier, some of you.

                       Enter Polonius.

  Prince. The King himself is, there willst thou be thought
    To see my son in the stock of the confines, and
    the servant of the sea of the sun, and the months of the truth
    as they should say to this world and her stones to the court of the
    sea, when his bed and second cheek is three and stops'd to his
    son to the sun and his angel.
  Ham. That should say I am now to think on the way.
  Pedro. Those thoughts to the sea, the storm, and their cause. I will be the
    toothagation to be thine, that the 