<a href="https://colab.research.google.com/github/vibhutidabas/Learning/blob/main/NLP/auto_text_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
print(tf.__version__)

2.7.0


In [2]:
import string
import requests

In [3]:
response = requests.get('https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt')
data = response.text.split('\n')

In [4]:
data = data[253:]  #work of shakespeare starts from line 254
data[0]

'  From fairest creatures we desire increase,'

In [5]:
data = " ".join(data)

In [6]:
# remove punctuations
def clean_text(doc):
  tokens = doc.split()
  table = str.maketrans('','',string.punctuation)               #removing punc.
  tokens = [ (w.translate(table)) for w in tokens]              #list w/out punc.
  tokens = [word.lower() for word in tokens if word.isalpha()]  #removing special charac.
  return tokens

In [7]:
tokens = clean_text(data)
print(tokens[:10])
print(len(set(tokens)))

['from', 'fairest', 'creatures', 'we', 'desire', 'increase', 'that', 'thereby', 'beautys', 'rose']
27956


In [8]:
length = 51   #we will use 50 words to train the model to predict
lines=[]
for i in range(length, len(tokens)):
  seq = tokens[i-length:i]           #sequence from 0 to 51
  line = ' '.join(seq)               #join tokens to form a line
  lines.append(line)                 #a line is a sequence of 51 words
  if i>200000:                       #take first 200000 words to tain
    break

In [9]:
print(len(lines))
lines[0]

199951


'from fairest creatures we desire increase that thereby beautys rose might never die but as the riper should by time decease his tender heir might bear his memory but thou contracted to thine own bright eyes feedst thy lights flame with selfsubstantial fuel making a famine where abundance lies thy self'

In [10]:
tokens[0], tokens[50]

('from', 'self')

Tokenization

In [11]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)   #noew word embedding will be done in which every token is assigned an integer
#converting text data to numerical values
sequences = tokenizer.texts_to_sequences(lines)  #will have list of integer values created by tokenizer

In [13]:
sequences = np.array(sequences)  #sequnecs array to numpy array

In [14]:
#rows= line of play, columns= first 50 words ar x and 51st column is y
x, y= sequences[:,:-1], sequences[:,-1]
#x=all but the last, y= last column

In [15]:
x[0]

array([   47,  1408,  1264,    37,   451,  1406,     9,  2766,  1158,
        1213,   171,   132,   269,    20,    24,     1,  4782,    87,
          30,    98,  4781,    18,   715,  1263,   171,   211,    18,
         829,    20,    27,  3807,     4,   214,   121,  1212,   153,
       13004,    31,  2765,  1847,    16, 13003, 13002,   754,     7,
        3806,    99,  2430,   466,    31])

In [16]:
x[0].shape, y[0]

((50,), 307)

In [17]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'i': 3,
 'to': 4,
 'of': 5,
 'you': 6,
 'a': 7,
 'my': 8,
 'that': 9,
 'in': 10,
 'is': 11,
 'not': 12,
 'it': 13,
 'for': 14,
 'me': 15,
 'with': 16,
 'he': 17,
 'his': 18,
 'your': 19,
 'but': 20,
 'be': 21,
 'this': 22,
 'have': 23,
 'as': 24,
 'him': 25,
 'so': 26,
 'thou': 27,
 'will': 28,
 'what': 29,
 'by': 30,
 'thy': 31,
 'no': 32,
 'are': 33,
 'all': 34,
 'her': 35,
 'do': 36,
 'we': 37,
 'if': 38,
 'our': 39,
 'or': 40,
 'shall': 41,
 'thee': 42,
 'which': 43,
 'on': 44,
 'lord': 45,
 'o': 46,
 'from': 47,
 'good': 48,
 'more': 49,
 'sir': 50,
 'was': 51,
 'they': 52,
 'well': 53,
 'at': 54,
 'would': 55,
 'when': 56,
 'now': 57,
 'come': 58,
 'love': 59,
 'th': 60,
 'than': 61,
 'am': 62,
 'then': 63,
 'she': 64,
 'their': 65,
 'them': 66,
 'how': 67,
 'enter': 68,
 'let': 69,
 'did': 70,
 'ill': 71,
 'hath': 72,
 'one': 73,
 'us': 74,
 'know': 75,
 'first': 76,
 'make': 77,
 'had': 78,
 'like': 79,
 'here': 80,
 'upon': 81,
 'there': 82,
 'man': 83,


In [18]:
vocab_size = len(tokenizer.word_index)+1
vocab_size

13009

In [19]:
y = to_categorical(y, num_classes=vocab_size)

In [20]:
x[1].shape

(50,)

In [21]:
seq_len = x.shape[1]

### Building LSTM

In [22]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=50, input_length=seq_len))
model.add(LSTM(units=100, return_sequences=True))
model.add(LSTM(units=100))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=vocab_size,activation='softmax'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 50)            650450    
                                                                 
 lstm (LSTM)                 (None, 50, 100)           60400     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                 
 dense_1 (Dense)             (None, 13009)             1313909   
                                                                 
Total params: 2,115,259
Trainable params: 2,115,259
Non-trainable params: 0
_________________________________________________________________


In [23]:
model.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['accuracy'])

In [None]:
model.fit(x,y,batch_size=10,epochs=20)

In [None]:
lines[12343]  #put this line as seed_text
seed_text = lines[12343]

def generate_text(model, tokenizer, text_seq_length, seed_text, n_words):
  text=[]
  for _ in range(n_words):
    encoded = tokenizer.text_to_sequences(seed_text[0])
    encoded = pad_sequences([encoded], maxlen=text_seq_length, truncating='pre')

    y_pred = model.predict_classes(encoded)
    predicted_word = ''
    for word, inedx in tokenizer.word_index.items():
      if index == y_pred:
        pred_word=word
        break
    seed_text = seed_text + ' ' +pred_word
    text.append(pred_word)
  return ' '.join(text)
  

In [None]:
generate_text_seq(model, tokenizer, seq_length, seed_text, 10)