In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import networkx as nx

In [5]:
path_to_dataset='/home/varun/Desktop/NLP-Hamlet by William Shakespeare.TXT'

In [6]:
text=open(path_to_dataset,'r').read()
text=text.lower()
sen=list(text.split(" "))

In [7]:
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ] 


In [8]:
sentences=[]
for word in stopwords:
    token = " " + word + " "
    text = text.replace(token, " ")
    text = text.replace("  ", " ")
sentences.append(text)

In [9]:
# print(sentences[:1000])


In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [11]:
tokenizer=Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

word_index=tokenizer.word_index
len_word_index=len(word_index)

word_index

{'<OOV>': 1,
 'ham': 2,
 'not': 3,
 'and': 4,
 'lord': 5,
 'haue': 6,
 'king': 7,
 'the': 8,
 'will': 9,
 'that': 10,
 'no': 11,
 'you': 12,
 'to': 13,
 'it': 14,
 'i': 15,
 'shall': 16,
 'hamlet': 17,
 'come': 18,
 'thou': 19,
 'good': 20,
 'hor': 21,
 'now': 22,
 'let': 23,
 'this': 24,
 'thy': 25,
 'may': 26,
 'me': 27,
 'enter': 28,
 'for': 29,
 'like': 30,
 'him': 31,
 'but': 32,
 'oh': 33,
 'a': 34,
 'so': 35,
 'well': 36,
 'as': 37,
 'know': 38,
 'of': 39,
 'selfe': 40,
 'loue': 41,
 "'tis": 42,
 'vs': 43,
 'sir': 44,
 'qu': 45,
 'must': 46,
 'laer': 47,
 'giue': 48,
 'one': 49,
 'thee': 50,
 'ile': 51,
 'in': 52,
 'hath': 53,
 'ophe': 54,
 'with': 55,
 'make': 56,
 'speake': 57,
 'time': 58,
 'is': 59,
 'see': 60,
 'father': 61,
 'say': 62,
 'vpon': 63,
 'doe': 64,
 'go': 65,
 'pol': 66,
 'man': 67,
 'heere': 68,
 'tell': 69,
 'heauen': 70,
 'mine': 71,
 'thus': 72,
 'play': 73,
 'much': 74,
 'rosin': 75,
 'then': 76,
 'thinke': 77,
 'can': 78,
 'or': 79,
 'horatio': 80,
 'moth

In [12]:
sequence=tokenizer.texts_to_sequences(sentences)
pad_sequence=pad_sequences(sequence,truncating="post",padding='post')

In [13]:
pad_sequence.shape

(1, 21236)

In [14]:
pad_sequence=np.array(pad_sequence)

In [15]:
pad_sequence.shape

(1, 21236)

In [16]:
flatten_pad_sequence=pad_sequence.flatten()
flatten_pad_sequence.shape

(21236,)

In [17]:
seq_length=130
total_num_seq=len(text)//(seq_length+1)

In [18]:
total_num_seq

1034

In [19]:
import tensorflow as tf
char_dataset=tf.data.Dataset.from_tensor_slices(flatten_pad_sequence)

In [20]:
sequence=char_dataset.batch(seq_length+1, drop_remainder=True)

In [21]:
def create_seq_target(seq):
    input_text=seq[:-1]
    target_text=seq[1:]
    
    return input_text,target_text

In [22]:
dataset=sequence.map(create_seq_target)

In [23]:
batch_size=128
buffer_size=1000
dataset=dataset.shuffle(buffer_size).batch(batch_size,drop_remainder=True)

In [24]:
dataset

<BatchDataset shapes: ((128, 130), (128, 130)), types: (tf.int32, tf.int32)>

In [25]:
vocab_size=len_word_index
embed_dim=70
rr_neuron=1024
from keras.losses import sparse_categorical_crossentropy
def sparse(y_true,y_pred):
    return sparse_categorical_crossentropy(y_true,y_pred,from_logits=True)

Using TensorFlow backend.


In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Embedding,GRU
def create_model(vocab_size,embed_dim,rr_neuron,batch_size):
    model=Sequential()
    model.add(Embedding(vocab_size,embed_dim,batch_input_shape=[batch_size,None]))
    model.add(GRU(rr_neuron,return_sequences=True, stateful=True,recurrent_initializer='glorot_uniform'))
    model.add(Dense(vocab_size))
    model.compile(optimizer='adam',loss=sparse)
    return model


In [27]:
model=create_model(vocab_size=vocab_size, embed_dim=embed_dim, rr_neuron=rr_neuron,batch_size=batch_size)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (128, None, 70)           374010    
_________________________________________________________________
gru (GRU)                    (128, None, 1024)         3366912   
_________________________________________________________________
dense (Dense)                (128, None, 5343)         5476575   
Total params: 9,217,497
Trainable params: 9,217,497
Non-trainable params: 0
_________________________________________________________________


In [30]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
# print(reverse_word_index)
# print(reverse_word_index[1])

In [29]:
#Checking Model Creation before training
for input_example,target_example in dataset.take(1):
    target_example=model(input_example)
    # print(input_example)
sample_indices=tf.random.categorical(target_example[0],num_samples=1)
sample_indices=tf.squeeze(sample_indices,axis=-1).numpy()
sample_indices

def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

print(decode_review(sample_indices))
# print(dataset.take(1).[input_example])


sayest temperance contrary aduice amber scarse bell com'st admirable begun takes exit creepe crie poesie closset bestow'd thinke flints bounds gyant melodious cicatrice stately obiect dreame gate secundus forbeare matine coniuration wharfe affraide teeth terrible almost against applicable read broken daysie antike sides ominous v howsoeuer transcription sequell window ont vnmastred contacting grandsire volly iuggel'd stops betoken vnschool'd spade shreds towards suiting botch comfort iot hum hammers lights falsely illo prisoner artlesse bloodie lust compact yeomans ladiship vnhappily zone lines osr equiuocation napkin fancie portraiture excellent squeezing throwne nickname electronic lights waste talke deepe holde aeneas vniuersity intruding perseuer intended iest england hands william mantle what breuitie gidge espials discover com sodainely caution choyse 3 passionate knowes seales look'd iocond crocodile pronounc'd lipps vsurpe conception carnegie delights turneth cou'nant necke


In [130]:
epochs=30
model.fit(dataset,epochs=epochs)
#Saving the model
model.save('Text_geneartor.h5') 

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [131]:
from tensorflow.keras.models import load_model

"""
You can load the trained model by invoking the comments below

"""
model=create_model(vocab_size=vocab_size, embed_dim=embed_dim, rr_neuron=rr_neuron,batch_size=1)
model.load_weights('/content/Text_geneartor.h5')
model.build(tf.TensorShape([1,None]))
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (1, None, 70)             374010    
_________________________________________________________________
gru_2 (GRU)                  (1, None, 1024)           3366912   
_________________________________________________________________
dense_2 (Dense)              (1, None, 5343)           5476575   
Total params: 9,217,497
Trainable params: 9,217,497
Non-trainable params: 0
_________________________________________________________________


In [153]:
print(model.summary())
#Text Generator function to generate the random text
def generate_text(model,start_text,gen_size=50,temp=1.0):
  num_generate=gen_size
  seq=tokenizer.texts_to_sequences(start_text)
  input_val=pad_sequences(seq)
  input_val=input_val.flatten()
  input_val=list(input_val)

  
  #Converting the input to the desired shape to feed the model
  input_val=tf.expand_dims(input_val,0)


  text_generated=[]
  temprature=temp

  model.reset_states()

  for i in range(num_generate):
    predictions=model(input_val)
    predictions=tf.squeeze(predictions,0)
    predictions=predictions/temprature
    predicted_id=tf.random.categorical(predictions,num_samples=1)[-1,0].numpy()

    input_val=tf.expand_dims([predicted_id],0)

    
    pred_text=(reverse_word_index[predicted_id])
    text_generated.append(pred_text)
    # print(text_generated)

  return(start_text + " ".join(text_generated))


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (1, None, 70)             374010    
_________________________________________________________________
gru_2 (GRU)                  (1, None, 1024)           3366912   
_________________________________________________________________
dense_2 (Dense)              (1, None, 5343)           5476575   
Total params: 9,217,497
Trainable params: 9,217,497
Non-trainable params: 0
_________________________________________________________________
None


In [154]:
print(generate_text(model,"HAMLET ",gen_size=1000))

HAMLETlets that alas polon the hath robustious to israel too downe haue enter heauen login ile divulging with am will wager often adue 'em rosincrance teach let man comes paint age election fairely project him throne 000 purport to dead vnripe then conceited fauourites pol pay assault station mock will say from lord seales haue as fashion peace as poet ham this since these low 'tis remedie breath although chace poyson lifted breake despight put vp ioue one and till stealers fauour king the sonne ham are i bound blew at much doomesday a seemes any sweet produce thinke gone present old might direct begin will king whose haue giues sure fiue such falles maiesticall thee harsh shamefull 'tis it coated drag'd you no all beauteous ham plain desart i of which it qu shall polon little ham bed excellent hor for steps possible worse heauen royalty rich ioue variable thus ham sing me denmarke assaies you guts owne it make first if husband am will that himselfe children aske delete deepe oh discou

In [99]:
# start_text="Hamlet"
# seq=tokenizer.texts_to_sequences(start_text)
# input_val=pad_sequences(sequences=seq)
# input_val=(input_val.flatten())
# input_val=list(input_val)
# index=[x for x in input_val]
# index=tf.expand_dims(index,0)

In [100]:
# input_val=tf.expand_dims(input_val,0)
# print(input_val)
# print(index)
# input_val.shape

tf.Tensor([[   1   34 1895 2870 2268    1]], shape=(1, 6), dtype=int32)
tf.Tensor([[   1   34 1895 2870 2268    1]], shape=(1, 6), dtype=int32)


TensorShape([1, 6])

In [146]:
# text_generated=[]
# temprature=1.0
# num_generate=1000
# model.reset_states()

# for i in range(num_generate):
#   predictions=model(input_val)
#   predictions=tf.squeeze(predictions,0)
#   predictions=predictions/temprature
#   predicted_id=tf.random.categorical(predictions,num_samples=1)[-1,0].numpy()

#   input_val=tf.expand_dims([predicted_id],0)

#   text_generated.append(reverse_word_index[predicted_id])

In [31]:
# str =" "
# print(str.join(text_generated))