In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
tokenizer = Tokenizer()

In [15]:
text_data = """It was the best of time,
it was the worst of time and alas i'm relaxed,
it was the age of wisdom, 
it was the age of foolishness
"""

In [16]:
tokenizer.fit_on_texts([text_data])

In [17]:
tokenizer.word_index

{'t': 1,
 's': 2,
 'e': 3,
 'i': 4,
 'o': 5,
 'w': 6,
 'a': 7,
 'h': 8,
 'f': 9,
 'it': 10,
 'was': 11,
 'the': 12,
 'of': 13,
 'm': 14,
 'g': 15,
 'time': 16,
 'age': 17,
 'b': 18,
 'r': 19,
 'd': 20,
 'l': 21,
 'n': 22,
 'best': 23,
 'worst': 24,
 'wisdom': 25,
 'foolishness': 26,
 'and': 27,
 'alas': 28,
 "i'm": 29,
 'relaxed': 30}

In [18]:
tokenizer.word_counts

OrderedDict([('i', 16),
             ('t', 24),
             ('w', 12),
             ('a', 12),
             ('s', 20),
             ('h', 10),
             ('e', 20),
             ('b', 2),
             ('o', 16),
             ('f', 10),
             ('m', 6),
             ('r', 2),
             ('g', 4),
             ('d', 2),
             ('l', 2),
             ('n', 2),
             ('it', 8),
             ('was', 8),
             ('the', 8),
             ('best', 2),
             ('of', 8),
             ('time', 4),
             ('worst', 2),
             ('age', 4),
             ('wisdom', 2),
             ('foolishness', 2),
             ('and', 1),
             ('alas', 1),
             ("i'm", 1),
             ('relaxed', 1)])

In [22]:

input_sequences = []

for sentence in text_data.split('\n'):
    
    tokenized_sent = tokenizer.texts_to_sequences([sentence])[0]
    
    for i in range(1,len(tokenized_sent)):
        input_sequences.append(tokenized_sent[:i+1])
        

In [23]:
input_sequences

[[10, 11],
 [10, 11, 12],
 [10, 11, 12, 23],
 [10, 11, 12, 23, 13],
 [10, 11, 12, 23, 13, 16],
 [10, 11],
 [10, 11, 12],
 [10, 11, 12, 24],
 [10, 11, 12, 24, 13],
 [10, 11, 12, 24, 13, 16],
 [10, 11, 12, 24, 13, 16, 27],
 [10, 11, 12, 24, 13, 16, 27, 28],
 [10, 11, 12, 24, 13, 16, 27, 28, 29],
 [10, 11, 12, 24, 13, 16, 27, 28, 29, 30],
 [10, 11],
 [10, 11, 12],
 [10, 11, 12, 17],
 [10, 11, 12, 17, 13],
 [10, 11, 12, 17, 13, 25],
 [10, 11],
 [10, 11, 12],
 [10, 11, 12, 17],
 [10, 11, 12, 17, 13],
 [10, 11, 12, 17, 13, 26]]

In [28]:
max_len = max([len(x) for x in input_sequences])

In [29]:
max_len

10

In [30]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [33]:
padded_input_seq = pad_sequences(input_sequences,max_len,padding='pre')

In [34]:
padded_input_seq

array([[ 0,  0,  0,  0,  0,  0,  0,  0, 10, 11],
       [ 0,  0,  0,  0,  0,  0,  0, 10, 11, 12],
       [ 0,  0,  0,  0,  0,  0, 10, 11, 12, 23],
       [ 0,  0,  0,  0,  0, 10, 11, 12, 23, 13],
       [ 0,  0,  0,  0, 10, 11, 12, 23, 13, 16],
       [ 0,  0,  0,  0,  0,  0,  0,  0, 10, 11],
       [ 0,  0,  0,  0,  0,  0,  0, 10, 11, 12],
       [ 0,  0,  0,  0,  0,  0, 10, 11, 12, 24],
       [ 0,  0,  0,  0,  0, 10, 11, 12, 24, 13],
       [ 0,  0,  0,  0, 10, 11, 12, 24, 13, 16],
       [ 0,  0,  0, 10, 11, 12, 24, 13, 16, 27],
       [ 0,  0, 10, 11, 12, 24, 13, 16, 27, 28],
       [ 0, 10, 11, 12, 24, 13, 16, 27, 28, 29],
       [10, 11, 12, 24, 13, 16, 27, 28, 29, 30],
       [ 0,  0,  0,  0,  0,  0,  0,  0, 10, 11],
       [ 0,  0,  0,  0,  0,  0,  0, 10, 11, 12],
       [ 0,  0,  0,  0,  0,  0, 10, 11, 12, 17],
       [ 0,  0,  0,  0,  0, 10, 11, 12, 17, 13],
       [ 0,  0,  0,  0, 10, 11, 12, 17, 13, 25],
       [ 0,  0,  0,  0,  0,  0,  0,  0, 10, 11],
       [ 0,  0,  0, 

In [52]:
X = padded_input_seq[:,:-1]

In [54]:
X

array([[ 0,  0,  0,  0,  0,  0,  0,  0, 10],
       [ 0,  0,  0,  0,  0,  0,  0, 10, 11],
       [ 0,  0,  0,  0,  0,  0, 10, 11, 12],
       [ 0,  0,  0,  0,  0, 10, 11, 12, 23],
       [ 0,  0,  0,  0, 10, 11, 12, 23, 13],
       [ 0,  0,  0,  0,  0,  0,  0,  0, 10],
       [ 0,  0,  0,  0,  0,  0,  0, 10, 11],
       [ 0,  0,  0,  0,  0,  0, 10, 11, 12],
       [ 0,  0,  0,  0,  0, 10, 11, 12, 24],
       [ 0,  0,  0,  0, 10, 11, 12, 24, 13],
       [ 0,  0,  0, 10, 11, 12, 24, 13, 16],
       [ 0,  0, 10, 11, 12, 24, 13, 16, 27],
       [ 0, 10, 11, 12, 24, 13, 16, 27, 28],
       [10, 11, 12, 24, 13, 16, 27, 28, 29],
       [ 0,  0,  0,  0,  0,  0,  0,  0, 10],
       [ 0,  0,  0,  0,  0,  0,  0, 10, 11],
       [ 0,  0,  0,  0,  0,  0, 10, 11, 12],
       [ 0,  0,  0,  0,  0, 10, 11, 12, 17],
       [ 0,  0,  0,  0, 10, 11, 12, 17, 13],
       [ 0,  0,  0,  0,  0,  0,  0,  0, 10],
       [ 0,  0,  0,  0,  0,  0,  0, 10, 11],
       [ 0,  0,  0,  0,  0,  0, 10, 11, 12],
       [ 0

In [55]:
y = padded_input_seq[:,-1]

In [56]:
y

array([11, 12, 23, 13, 16, 11, 12, 24, 13, 16, 27, 28, 29, 30, 11, 12, 17,
       13, 25, 11, 12, 17, 13, 26], dtype=int32)