In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Word index

In [None]:
#@title word index without \<OOV\>
# Small example corpus used to fit the tokenizer.
sentences = [
    "i love my dog",
    "I, love my cat",
    "You love my dog!",
    "Do you think my dog is amazing?",
]

# Build the vocabulary from the corpus (no OOV token configured here).
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
print(word_index)

# Encode the training sentences as lists of word indices.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

# Reuse the fitted tokenizer on sentences that contain unseen words.
other_sentences = [
    "I really love my dog",
    "my dog loves my manatee",
]
test_seq = tokenizer.texts_to_sequences(other_sentences)
print(test_seq)
# Words absent from the fitted vocabulary ('really', 'loves', 'manatee')
# have no index, so they are dropped from the encoded sequences.

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}
[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]
[[4, 2, 1, 3], [1, 3, 1]]


In [None]:
#@title word index with \<OOV\>
# Refit on the same corpus, this time reserving a token for unknown words.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=100)
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
print(word_index)

# Words missing from the word index are now encoded as the OOV
# (out-of-vocabulary) token instead of being omitted.
test_seq = tokenizer.texts_to_sequences(other_sentences)
print(test_seq)


{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]


# Padding

When we fed images into the network for training, we needed them to be uniform in size, so we used generators to resize each image to fit.  
There is a similar requirement for text: before we can train on it, every sequence must have the same length, which we achieve by padding shorter sequences (and truncating longer ones).

In [None]:
# Re-encode the corpus with the current tokenizer so that the printed
# word_index and sequences come from the same vocabulary. `sequences` was
# last built before the tokenizer was refit with an OOV token, which shifted
# every word's index by one.
sequences = tokenizer.texts_to_sequences(sentences)

# Defaults: zeros are prepended (padding='pre') up to the longest sequence length.
padded = pad_sequences(sequences)
# Custom: pad at the end, cap the length at 5, and cut extra tokens from the end.
padded_custom_params = pad_sequences(sequences, padding='post', truncating='post', maxlen=5)

print(word_index)
print(sequences)
print(padded)
print(padded_custom_params)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]
[[ 0  0  0  4  2  1  3]
 [ 0  0  0  4  2  1  6]
 [ 0  0  0  5  2  1  3]
 [ 7  5  8  1  3  9 10]]
[[4 2 1 3 0]
 [4 2 1 6 0]
 [5 2 1 3 0]
 [7 5 8 1 3]]
