<a href="https://colab.research.google.com/github/tejas-srikanth/Shakespeare-text-generator/blob/master/Shakespeare_text_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import matplotlib.pyplot as plt


In [None]:
# Record whether a CUDA device is available; later cells use this flag to
# decide whether to move the model and tensors to the GPU.
use_gpu = torch.cuda.is_available()
use_gpu

True

In [None]:
# Mount Google Drive (Colab only) so the paths under /content/gdrive used by
# later cells are reachable.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Path of the Shakespeare corpus text file on the mounted Drive.
root = '/content/gdrive/My Drive/Colab Notebooks/NLP_with_pytorch/Data/shakespeare.txt'

In [None]:
# Load the entire corpus into memory as one string.
with open(root, mode='r', encoding="utf8") as corpus_file:
  all_text = corpus_file.read()

In [None]:
# Peek at the first 500 characters of the loaded corpus.
all_text[:500]

"\n                     1\n  From fairest creatures we desire increase,\n  That thereby beauty's rose might never die,\n  But as the riper should by time decease,\n  His tender heir might bear his memory:\n  But thou contracted to thine own bright eyes,\n  Feed'st thy light's flame with self-substantial fuel,\n  Making a famine where abundance lies,\n  Thy self thy foe, to thy sweet self too cruel:\n  Thou that art now the world's fresh ornament,\n  And only herald to the gaudy spring,\n  Within thine own bu"

# Encode values

In [None]:
# Build the vocabulary in sorted order. Iterating a bare set of strings is not
# stable across interpreter sessions (string hashing is randomised), so the
# index <-> character mapping would change after a kernel restart and a saved
# checkpoint would no longer decode to the characters it was trained on.
all_characters = sorted(set(all_text))
# decoder maps integer index -> character.
decoder = dict(enumerate(all_characters))

In [None]:
# Invert the decoder so that encoder maps character -> integer index.
encoder = {char: idx for idx, char in decoder.items()}
encoder

{'\n': 82,
 ' ': 39,
 '!': 75,
 '"': 12,
 '&': 11,
 "'": 55,
 '(': 61,
 ')': 72,
 ',': 67,
 '-': 0,
 '.': 83,
 '0': 22,
 '1': 27,
 '2': 63,
 '3': 71,
 '4': 6,
 '5': 51,
 '6': 34,
 '7': 59,
 '8': 2,
 '9': 68,
 ':': 21,
 ';': 25,
 '<': 38,
 '>': 18,
 '?': 14,
 'A': 4,
 'B': 77,
 'C': 24,
 'D': 42,
 'E': 64,
 'F': 60,
 'G': 36,
 'H': 35,
 'I': 28,
 'J': 48,
 'K': 47,
 'L': 49,
 'M': 20,
 'N': 56,
 'O': 57,
 'P': 45,
 'Q': 62,
 'R': 15,
 'S': 37,
 'T': 33,
 'U': 81,
 'V': 23,
 'W': 78,
 'X': 8,
 'Y': 30,
 'Z': 58,
 '[': 26,
 ']': 32,
 '_': 9,
 '`': 65,
 'a': 69,
 'b': 44,
 'c': 31,
 'd': 40,
 'e': 17,
 'f': 43,
 'g': 16,
 'h': 13,
 'i': 5,
 'j': 46,
 'k': 54,
 'l': 80,
 'm': 73,
 'n': 66,
 'o': 70,
 'p': 79,
 'q': 1,
 'r': 53,
 's': 76,
 't': 7,
 'u': 3,
 'v': 52,
 'w': 74,
 'x': 29,
 'y': 10,
 'z': 41,
 '|': 19,
 '}': 50}

In [None]:
# Translate every character of the corpus into its integer index.
encoded_text = np.array(list(map(encoder.__getitem__, all_text)))

# One Hot Encoder

In [None]:
def one_hot_encoder(encoded_text, num_unique_chars):
   """One-hot encode an array of integer character indices.

   Args:
      encoded_text: NumPy integer array of any shape; each value selects the
         slot set to 1 along the new last axis.
      num_unique_chars: Length of the one-hot axis (vocabulary size).

   Returns:
      A ``float32`` array of shape ``encoded_text.shape + (num_unique_chars,)``
      containing exactly one 1.0 per encoded position.
   """
   # Allocate as float32 up front. `ndarray.astype` returns a new array, so
   # calling it without using the result leaves the buffer as float64.
   one_hot = np.zeros((encoded_text.size, num_unique_chars), dtype=np.float32)

   # Work on a flat (num_positions, vocab) view: row i gets a 1 in the column
   # named by the i-th encoded value.
   one_hot[np.arange(one_hot.shape[0]), encoded_text.flatten()] = 1.0

   # Restore the caller's shape, with the one-hot axis appended last.
   return one_hot.reshape(*encoded_text.shape, num_unique_chars)

In [None]:
# Sanity check: encode the single index 0 against a vocabulary of size 3.
print(one_hot_encoder(np.array([0]), 3))

[[1. 0. 0.]]


# Batch creator

In [None]:
def create_batches(encoded_text, samp_batch_size, seq_len):
  """Yield (x, y) training windows from a 1-D array of encoded characters.

  The text is truncated to a whole number of batches and reshaped into
  ``samp_batch_size`` rows; each yielded window covers ``seq_len`` consecutive
  columns of every row.

  Args:
    encoded_text: 1-D NumPy array of integer character indices.
    samp_batch_size: Number of rows (independent sequences) per batch.
    seq_len: Number of characters per sequence in each batch.

  Yields:
    Tuples ``(x, y)`` of arrays shaped ``(samp_batch_size, seq_len)`` where
    ``y`` is ``x`` shifted one character to the left. For the final window the
    last target of each row wraps around to the first column of that row.
  """
  total_chars_batch = samp_batch_size * seq_len
  num_total_batches = len(encoded_text) // total_chars_batch
  # Drop the tail that does not fill a complete batch.
  enc_txt = encoded_text[:num_total_batches * total_chars_batch]
  enc_txt = enc_txt.reshape(samp_batch_size, -1)
  chars_per_row = enc_txt.shape[1]

  for n in range(0, chars_per_row, seq_len):
    x = enc_txt[:, n:n + seq_len]
    y = np.zeros_like(x)
    y[:, :-1] = x[:, 1:]

    # The last target is the character just past the window. Check the
    # boundary explicitly rather than catching every exception, so unrelated
    # errors are not silently swallowed.
    next_col = n + seq_len
    if next_col >= chars_per_row:
      next_col = 0
    y[:, -1] = enc_txt[:, next_col]

    yield x, y


In [None]:
# Smoke-test the batch generator on a small toy array and show the first (x, y) pair.
arr = np.arange(30)
next(create_batches(arr, 2, 5))

(array([[ 0,  1,  2,  3,  4],
        [15, 16, 17, 18, 19]]), array([[ 1,  2,  3,  4,  5],
        [16, 17, 18, 19, 20]]))

# Create Model

In [None]:
class Model(nn.Module):
  """Character-level language model: stacked LSTM -> dropout -> linear layer.

  Args:
    all_characters: Iterable of the vocabulary characters. Its iteration order
      defines the index <-> character mapping stored in ``decoder``/``encoder``.
    num_hidden: Hidden size of each LSTM layer.
    num_layers: Number of stacked LSTM layers.
    drop_prob: Dropout probability, used both between LSTM layers and on the
      LSTM output before the linear layer.
    use_gpu: Flag stored on the instance for callers that decide whether to
      move the model and tensors to CUDA.
  """

  def __init__(self, all_characters, num_hidden=256, num_layers=4, drop_prob=0.5, use_gpu=False ):

    super().__init__()
    self.num_hidden = num_hidden
    self.num_layers = num_layers
    # Honour the caller's dropout probability instead of a hard-coded 0.5.
    self.drop_prob = drop_prob
    self.use_gpu = use_gpu

    self.all_characters = all_characters
    # decoder: index -> character; encoder: character -> index.
    self.decoder = dict(enumerate(all_characters))
    self.encoder = {char: idx for idx, char in self.decoder.items()}

    # Input is one-hot, so the LSTM input size equals the vocabulary size.
    self.lstm = nn.LSTM(len(all_characters), hidden_size=num_hidden, num_layers=num_layers, batch_first=True, dropout=drop_prob)
    self.dropout = nn.Dropout(drop_prob)
    self.fc_linear = nn.Linear(num_hidden, len(all_characters))

  def forward(self, x, hidden):
    """Run one forward pass.

    Args:
      x: One-hot input of shape ``(batch, seq_len, vocab_size)`` (the LSTM is
        built with ``batch_first=True``).
      hidden: ``(h, c)`` tuple as returned by :meth:`hidden` or a previous call.

    Returns:
      ``(logits, hidden)`` where ``logits`` has shape
      ``(batch * seq_len, vocab_size)`` and ``hidden`` is the updated state.
    """
    lstm_out, hidden = self.lstm(x, hidden)
    # Flatten batch and time so the linear layer scores every position at once.
    drop_out = self.dropout(lstm_out).contiguous().view(-1, self.num_hidden)
    x_out = self.fc_linear(drop_out)

    return x_out, hidden

  def hidden(self, batch_size):
    """Return a zero-initialised ``(h, c)`` state for ``batch_size`` sequences.

    Each tensor has shape ``(num_layers, batch_size, num_hidden)`` and is
    allocated on the same device and dtype as the model's parameters, so the
    state always matches wherever the model currently lives.
    """
    reference = next(self.parameters())
    shape = (self.num_layers, batch_size, self.num_hidden)

    return (reference.new_zeros(shape), reference.new_zeros(shape))

In [None]:
# Instantiate the network and place it on the GPU when one is available.
model = Model(
    all_characters,
    num_hidden=512,
    num_layers=3,
    drop_prob=0.5,
    use_gpu=use_gpu,
)
if use_gpu:
  model = model.cuda()

# Train Validation Split

In [None]:
# Use the first 90% of the encoded corpus for training and the rest for validation.
train_percentage = 0.9
num_train = int(train_percentage * len(encoded_text))
train_set, val_set = encoded_text[:num_train], encoded_text[num_train:]

# Loss and Optimizer

In [None]:
# Cross-entropy over the per-character logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Variables

In [None]:
epochs = 20
seq_len = 100
batch_size = 128
# Vocabulary size = largest index + 1. Use the array's own vectorised max():
# the builtin max() walks the NumPy array one element at a time in Python.
num_unique = int(encoded_text.max()) + 1

# Global step counter, incremented once per training batch.
tracker = 0

# Train the model

In [None]:
model.train()

for i in range(epochs):
  # Start every epoch from a zero state; within the epoch the state is carried
  # from one batch to the next.
  hidden_state = model.hidden(batch_size)

  for x, y in create_batches(train_set, batch_size, seq_len):
    tracker += 1

    x = one_hot_encoder(x, num_unique)

    inputs = torch.from_numpy(x).float()
    target = torch.from_numpy(y)

    if use_gpu:
      inputs = inputs.cuda()
      target = target.cuda()

    optimizer.zero_grad()
    # Detach the carried state so backpropagation stops at the batch boundary.
    hidden_state = tuple(state.detach() for state in hidden_state)
    output, hidden_state = model(inputs, hidden_state)
    loss = criterion(output, target.view(batch_size*seq_len).long())

    loss.backward()

    # Clip the gradient norm before the optimizer step.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)

    optimizer.step()

    # Every 25 steps, evaluate on the whole validation split.
    if tracker % 25 == 0:

      model.eval()

      val_losses = []

      hidden_val = model.hidden(batch_size)

      # No gradients are needed for evaluation.
      with torch.no_grad():
        for x_val, y_val in create_batches(val_set, batch_size, seq_len):
          x_val = one_hot_encoder(x_val, num_unique)

          input_val = torch.from_numpy(x_val).float()
          target_val = torch.from_numpy(y_val)

          if use_gpu:
            input_val = input_val.cuda()
            target_val = target_val.cuda()

          out_val, hidden_val = model(input_val, hidden_val)
          val_loss = criterion(out_val, target_val.view(batch_size*seq_len).long())

          # Record the validation loss of every validation batch.
          val_losses.append(val_loss.item())

      # Guard against a validation split too small to form a single batch.
      mean_val_loss = sum(val_losses) / len(val_losses) if val_losses else float('nan')

      print(f'EPOCH: {i+1}    STEP: {tracker}   LOSS: {loss.item()}   VAL LOSS: {mean_val_loss}')

      model.train()

EPOCH: 1    STEP: 25   LOSS: 3.192826509475708
EPOCH: 1    STEP: 50   LOSS: 3.1964797973632812
EPOCH: 1    STEP: 75   LOSS: 3.231184959411621
EPOCH: 1    STEP: 100   LOSS: 3.215151071548462
EPOCH: 1    STEP: 125   LOSS: 3.171006679534912
EPOCH: 1    STEP: 150   LOSS: 3.0044960975646973
EPOCH: 1    STEP: 175   LOSS: 2.9398000240325928
EPOCH: 1    STEP: 200   LOSS: 2.7872278690338135
EPOCH: 1    STEP: 225   LOSS: 2.7979300022125244
EPOCH: 1    STEP: 250   LOSS: 2.648042917251587
EPOCH: 1    STEP: 275   LOSS: 2.5240023136138916
EPOCH: 1    STEP: 300   LOSS: 2.4200057983398438
EPOCH: 1    STEP: 325   LOSS: 2.3248562812805176
EPOCH: 1    STEP: 350   LOSS: 2.239218235015869
EPOCH: 1    STEP: 375   LOSS: 2.21869158744812
EPOCH: 2    STEP: 400   LOSS: 2.1812283992767334
EPOCH: 2    STEP: 425   LOSS: 2.116616725921631
EPOCH: 2    STEP: 450   LOSS: 2.111952304840088
EPOCH: 2    STEP: 475   LOSS: 2.062443733215332
EPOCH: 2    STEP: 500   LOSS: 2.0265390872955322
EPOCH: 2    STEP: 525   LOSS: 2.05

In [None]:
# Persist the model's state dict to Drive.
torch.save(model.state_dict(), '/content/gdrive/My Drive/Colab Notebooks/NLP_with_pytorch/512Hidden3Layers.pt')

In [None]:
# Rebuild the architecture with the same hyper-parameters, then restore the saved weights.
model = Model(all_characters, num_hidden=512, num_layers=3, drop_prob=0.5, use_gpu=use_gpu)
checkpoint_path = '/content/gdrive/My Drive/Colab Notebooks/NLP_with_pytorch/512Hidden3Layers.pt'
# The freshly built model lives on the CPU, so map the checkpoint there. This
# also lets a checkpoint written on a GPU runtime load on a CPU-only one.
model.load_state_dict(torch.load(checkpoint_path, map_location='cpu'))

<All keys matched successfully>

In [None]:
def predict_next_char(model, char, hidden=None, k=1):
    """Sample the next character given one input character and the LSTM state.

    Args:
        model: Model exposing ``encoder``, ``decoder``, ``all_characters`` and
            ``hidden(batch_size)``.
        char: The single input character; must be a key of ``model.encoder``.
        hidden: ``(h, c)`` state from a previous call, or ``None`` to start
            from a zero state.
        k: Sample among the ``k`` most probable characters, weighted by their
            renormalised probabilities.

    Returns:
        ``(next_char, hidden)``: the sampled character and the updated state.
    """
    device = next(model.parameters()).device

    # Shape (1, 1, vocab): a batch of one sequence holding one character.
    one_hot = one_hot_encoder(np.array([[model.encoder[char]]]), len(model.all_characters))
    # Always cast to float32 and follow the model's device; the weights are
    # float32, so a float64 input fails on CPU as well as on GPU.
    inputs = torch.from_numpy(one_hot).float().to(device)

    if hidden is None:
        hidden = model.hidden(1)
    hidden = tuple(state.detach().to(device) for state in hidden)

    with torch.no_grad():
        logits, hidden = model(inputs, hidden)
        probs = F.softmax(logits, dim=1).cpu()

    top_probs, top_indices = probs.topk(k)
    # flatten() keeps a 1-D array even for k == 1. A squeezed 0-d array would
    # be interpreted by np.random.choice as "sample from arange(n)".
    top_indices = top_indices.numpy().flatten()
    top_probs = top_probs.numpy().flatten()
    top_probs = top_probs / top_probs.sum()
    chosen = np.random.choice(top_indices, p=top_probs)

    return model.decoder[int(chosen)], hidden

In [None]:
def generate_text(model, size, seed='The', k=1):
    """Generate text by repeatedly sampling the next character from the model.

    The model is first primed with every character of ``seed``; the prediction
    made after the last seed character becomes the first generated character,
    and ``size`` further characters are then sampled one at a time.

    Args:
        model: Trained model exposing ``use_gpu`` and ``hidden(batch_size)``.
        size: Number of sampling steps after priming.
        seed: Non-empty starting text.
        k: Passed to ``predict_next_char`` (sample among the top-k characters).

    Returns:
        The seed followed by ``size + 1`` generated characters.

    Raises:
        ValueError: If ``seed`` is empty (there is nothing to prime from).
    """
    if not seed:
        raise ValueError("seed must contain at least one character")

    if model.use_gpu:
        model.cuda()
    else:
        model.cpu()
    model.eval()

    output_chars = list(seed)
    # The model's state initialiser is named `hidden`.
    hidden = model.hidden(1)

    with torch.no_grad():
        # Prime the state with the seed; only the final prediction is kept.
        for char in seed:
            char, hidden = predict_next_char(model, char, hidden, k=k)
        output_chars.append(char)

        # Feed each generated character back in to obtain the next one.
        for _ in range(size):
            char, hidden = predict_next_char(model, output_chars[-1], hidden, k=k)
            output_chars.append(char)

    return ''.join(output_chars)

In [None]:
# Sample with generate_text, the generator defined in this notebook
# (`predict_sequence` is not defined anywhere in it).
shakey = generate_text(model, 10000, seed="The ", k=3)

In [None]:
# print() rather than a bare expression so the newlines render as line breaks.
print(shakey)

The three thing the woman will
    There were not beaten.
  Pedro. The world will be a mind, that the son of her side,  
    Where I have seen't to th' man.
  Ham. I wish my life.
    Then what this short on thee, that shall I stand
    When thou art a most star and so much son.
    There is no man at marching too, and the story
    Were to the chamber, and the words of mine,
    Whose presencious soul is all and this  
    A prophotition to the will and sorrow.
    There in mine ears will say this seat thou wouldst
    The sun that he should be the storm of man
    That shall be made and be the winder to him;
    Therefore be there as this so fair as hell.
    If they should see them all, I was not sent,
    When I have talk'd to be a service to
    The word of the chair, which this thou seest her,
    And there in his shaming thought of honesty,
    And see he hath but born and stores as stand,
    As if his honour was the contents of my son.
    I am too string and, as I shall be tr

In [None]:
# Write inside a context manager so the file is flushed and closed even if the
# write fails; the cell still displays the number of characters written.
with open('/content/gdrive/My Drive/Colab Notebooks/NLP_with_pytorch/Data/ai_play.txt', 'w', encoding="utf8") as f:
  num_chars_written = f.write(shakey)
num_chars_written

10005