In [1]:
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
import numpy as np
import matplotlib.pyplot as plt

In [3]:
with open('../Data/shakespeare.txt','r',encoding='utf8') as f:
    text = f.read()

In [4]:
type(text)

str

In [5]:
print(text[:1000])


                     1
  From fairest creatures we desire increase,
  That thereby beauty's rose might never die,
  But as the riper should by time decease,
  His tender heir might bear his memory:
  But thou contracted to thine own bright eyes,
  Feed'st thy light's flame with self-substantial fuel,
  Making a famine where abundance lies,
  Thy self thy foe, to thy sweet self too cruel:
  Thou that art now the world's fresh ornament,
  And only herald to the gaudy spring,
  Within thine own bud buriest thy content,
  And tender churl mak'st waste in niggarding:
    Pity the world, or else this glutton be,
    To eat the world's due, by the grave and thee.


                     2
  When forty winters shall besiege thy brow,
  And dig deep trenches in thy beauty's field,
  Thy youth's proud livery so gazed on now,
  Will be a tattered weed of small worth held:  
  Then being asked, where all thy beauty lies,
  Where all the treasure of thy lusty days;
  To say within thine own deep su

In [6]:
len(text)

5445609

In [7]:
all_characters = set(text)

In [8]:
len(all_characters)

84

In [9]:
# number of value ---> letter
decoder = dict(enumerate(all_characters))

In [10]:
#decoder

In [11]:
# letter ---> corresponding number
encoder = {char: idx for idx,char in decoder.items()}

In [12]:
#encoder

In [13]:
encoded_text = np.array([encoder[char] for char in text])

In [14]:
encoded_text[:500]

array([25, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
       20, 20, 20, 20, 20, 53, 25, 20, 20, 27, 51, 58, 42, 20, 26, 43,  2,
       51,  8, 28, 68, 20, 34, 51,  8, 43, 68, 21, 51,  8, 28, 20, 78,  8,
       20, 10,  8, 28,  2, 51,  8, 20,  2, 44, 34, 51,  8, 43, 28,  8, 52,
       25, 20, 20,  7, 16, 43, 68, 20, 68, 16,  8, 51,  8, 63, 74, 20, 63,
        8, 43, 21, 68, 74, 64, 28, 20, 51, 58, 28,  8, 20, 42,  2,  3, 16,
       68, 20, 44,  8, 72,  8, 51, 20, 10,  2,  8, 52, 25, 20, 20,  0, 21,
       68, 20, 43, 28, 20, 68, 16,  8, 20, 51,  2,  1,  8, 51, 20, 28, 16,
       58, 21, 79, 10, 20, 63, 74, 20, 68,  2, 42,  8, 20, 10,  8, 34,  8,
       43, 28,  8, 52, 25, 20, 20, 83,  2, 28, 20, 68,  8, 44, 10,  8, 51,
       20, 16,  8,  2, 51, 20, 42,  2,  3, 16, 68, 20, 63,  8, 43, 51, 20,
       16,  2, 28, 20, 42,  8, 42, 58, 51, 74, 57, 25, 20, 20,  0, 21, 68,
       20, 68, 16, 58, 21, 20, 34, 58, 44, 68, 51, 43, 34, 68,  8, 10, 20,
       68, 58, 20, 68, 16

In [15]:
decoder[43]

'a'

In [16]:
def one_hot_encoder(encoded_text, num_uni_chars):
    # encoded text ---> batch of encoded text
    # num_uni_chars ---> len(set(text))

    one_hot = np.zeros((encoded_text.size, num_uni_chars))

    # convert data type
    one_hot = one_hot.astype(np.float32)

    # set values to 1 for each corresponding character (i.e. 0, 0, 0, 0, ..., 1(43rd position), 0, 0, ...) for ' ' (space)
    one_hot[np.arange(one_hot.shape[0]), encoded_text.flatten()] = 1.0

    one_hot = one_hot.reshape((*encoded_text.shape, num_uni_chars))

    return one_hot

In [17]:
arr = np.array([1,2,0])

In [18]:
arr

array([1, 2, 0])

In [19]:
# for idx 1 of row 1 -> 1, idx 2 of row 2 -> 1, and idx 0 of row 2 -> 1
one_hot_encoder(arr, 3)

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

In [20]:
example_text = np.arange(10)

In [21]:
example_text

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [22]:
example_text.reshape((5,-1))

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])

In [28]:
def generate_batches(encoded_text, samp_per_batch=10, seq_len=50):
    # X: encoded text of length seq_len
    # ex: [0, 1, 2]
    #     [1, 2, 3]
    
    # Y: encoded text shifted by one
    # ex: [1, 2, 3]
    #     [2, 3, 4]
    
    char_per_batch = samp_per_batch * seq_len # total number of chars per batch
    num_batches_avail = int(len(encoded_text)/char_per_batch) # how many batches can we make

    # cut off the end of the encoded text, that won't fit evenly into a batch (remove last few chars, up to 49)
    encoded_text = encoded_text[:num_batches_avail*char_per_batch]

    encoded_text = encoded_text.reshape((samp_per_batch,-1))

    for n in range(0, encoded_text.shape[1], seq_len):
        x = encoded_text[:,n:n+seq_len]

        # zero array to the same shape as x
        y = np.zeros_like(x)

        try:
            y[:,:-1] = x[:,1:]
            y[:,-1] = encoded_text[:,n+seq_len]
        except:
            y[:,:-1] = x[:,1:]
            y[:,-1] = encoded_text[:,0]

        yield x,y

In [33]:
sample_text = np.arange(20)

In [34]:
sample_text

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [35]:
batch_generator = generate_batches(sample_text, samp_per_batch=2, seq_len=5)

In [36]:
x,y = next(batch_generator)

In [37]:
x

array([[ 0,  1,  2,  3,  4],
       [10, 11, 12, 13, 14]])

In [38]:
y

array([[ 1,  2,  3,  4,  5],
       [11, 12, 13, 14, 15]])

In [46]:
class CharModel(nn.Module):
    def __init__(self, all_chars, num_hidden=256, num_layers=4, drop_prob=0.5, use_gpu=False):
        super().__init__()
        self.drop_prob = drop_prob
        self.num_layers = num_layers
        self.num_hidden = num_hidden
        self.use_gpu = use_gpu

        self.all_chars = all_chars
        self.decoder = dict(enumerate(all_chars))
        self.encoder = {char:idx for idx, char in decoder.items()}

        # use batch_first=True to match the format as batch generator
        self.lstm = nn.LSTM(len(self.all_chars), num_hidden, num_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc_linear = nn.Linear(num_hidden, len(self.all_chars))

    def forward(self, x, hidden):
        lstm_output, hidden = self.lstm(x,hidden)
        drop_output = self.dropout(lstm_output)
        drop_output = drop_output.contiguous().view(-1,self.num_hidden) # reshaping
        final_out = self.fc_linear(drop_output)

        return final_out, hidden

    def hidden_state(self, batch_size):
        if self.use_gpu:
            hidden = (torch.zeros(self.num_layers, batch_size, self.num_hidden).cuda(),
                torch.zeros(self.num_layers, batch_size, self.num_hidden).cuda())
        else:
            hidden = (torch.zeros(self.num_layers, batch_size, self.num_hidden),
                torch.zeros(self.num_layers, batch_size, self.num_hidden))

        return hidden

In [47]:
model = CharModel(all_chars=all_characters, num_hidden=512, num_layers=3, drop_prob=0.5, use_gpu=True)

In [48]:
total_param = []
for p in model.parameters():
    total_param.append(int(p.numel()))

In [49]:
sum(total_param)
# try to match size of dataset (at least order of magnitude)
# too many parameters for too small text data -> overfitting
# too less parameters for too much text data -> underfitting

5470292

In [50]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [51]:
train_percent = 0.1

In [52]:
train_idx = int(len(encoded_text)*train_percent)

In [53]:
train_data = encoded_text[:train_idx]
val_data = encoded_text[train_idx:]

In [54]:
len(train_data)

544560

In [55]:
len(val_data)

4901049

In [56]:
train_percent = 0.9 # majority of data will be used for training

In [59]:
train_idx = int(len(encoded_text) * train_percent)

In [60]:
train_data = encoded_text[:train_idx]
val_data = encoded_text[train_idx:]