The task is to explore how to preprocess text in TensorFlow: tokenizing sentences into integer sequences and padding them to a common length.

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Toy corpus: five sentences quoted from "The Hobbit, or There and Back Again"
# by J. R. R. Tolkien. Long sentences are wrapped with implicit string
# concatenation purely for readability; each list item is still one string.
sentences = [
    "In a hole in the ground there lived a hobbit.",
    (
        "It had a perfectly round door like a porthole, painted green, "
        "with a shiny yellow brass knob in the exact middle."
    ),
    (
        "Tales and adventures sprouted up all over the place wherever he went, "
        "in the most extraordinary fashion."
    ),
    "All that the unsuspecting Bilbo saw that morning was an old man with a staff.",
    (
        "But Gandalf looked at him from under long bushy eyebrows "
        "that stuck out further than the brim of his shady hat."
    ),
]

In [4]:
# Build the vocabulary and encode the corpus. Order matters: the tokenizer
# must be fitted before it can turn text into index sequences.
# NOTE(review): this Tokenizer class is reported as deprecated in recent
# TensorFlow releases in favour of tf.keras.layers.TextVectorization --
# confirm against the installed version before reusing this pattern.

# num_words is an upper bound on the vocabulary size used when encoding; this
# corpus has far fewer distinct words, so the cap has no effect here.
tokenizer = Tokenizer(num_words=20000)
# Learn the word -> index mapping from the corpus.
tokenizer.fit_on_texts(sentences)
# Replace every word of every sentence with its integer index.
sequences = tokenizer.texts_to_sequences(sentences)

In [5]:
# Show the encoded corpus: one list of integer word indices per sentence,
# with lengths that still differ from sentence to sentence.
print(sequences)

[[3, 1, 7, 3, 2, 8, 9, 10, 1, 11], [12, 13, 1, 14, 15, 16, 17, 1, 18, 19, 20, 5, 1, 21, 22, 23, 24, 3, 2, 25, 26], [27, 28, 29, 30, 31, 6, 32, 2, 33, 34, 35, 36, 3, 2, 37, 38, 39], [6, 4, 2, 40, 41, 42, 4, 43, 44, 45, 46, 47, 5, 1, 48], [49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 4, 59, 60, 61, 62, 2, 63, 64, 65, 66, 67]]


In [6]:
# Inspect the learned vocabulary: a dict mapping each lower-cased,
# punctuation-free word to a 1-based integer index, most frequent words first.
tokenizer.word_index

{'a': 1,
 'the': 2,
 'in': 3,
 'that': 4,
 'with': 5,
 'all': 6,
 'hole': 7,
 'ground': 8,
 'there': 9,
 'lived': 10,
 'hobbit': 11,
 'it': 12,
 'had': 13,
 'perfectly': 14,
 'round': 15,
 'door': 16,
 'like': 17,
 'porthole': 18,
 'painted': 19,
 'green': 20,
 'shiny': 21,
 'yellow': 22,
 'brass': 23,
 'knob': 24,
 'exact': 25,
 'middle': 26,
 'tales': 27,
 'and': 28,
 'adventures': 29,
 'sprouted': 30,
 'up': 31,
 'over': 32,
 'place': 33,
 'wherever': 34,
 'he': 35,
 'went': 36,
 'most': 37,
 'extraordinary': 38,
 'fashion': 39,
 'unsuspecting': 40,
 'bilbo': 41,
 'saw': 42,
 'morning': 43,
 'was': 44,
 'an': 45,
 'old': 46,
 'man': 47,
 'staff': 48,
 'but': 49,
 'gandalf': 50,
 'looked': 51,
 'at': 52,
 'him': 53,
 'from': 54,
 'under': 55,
 'long': 56,
 'bushy': 57,
 'eyebrows': 58,
 'stuck': 59,
 'out': 60,
 'further': 61,
 'than': 62,
 'brim': 63,
 'of': 64,
 'his': 65,
 'shady': 66,
 'hat': 67}

In [7]:
# Turn the variable-length index lists into a rectangular integer array.
# Shorter sequences are filled with zeros at the front ("pre" padding, which
# is also the library default) until they match the longest sequence.
data = pad_sequences(sequences, padding="pre")
print(data)

[[ 0  0  0  0  0  0  0  0  0  0  0  3  1  7  3  2  8  9 10  1 11]
 [12 13  1 14 15 16 17  1 18 19 20  5  1 21 22 23 24  3  2 25 26]
 [ 0  0  0  0 27 28 29 30 31  6 32  2 33 34 35 36  3  2 37 38 39]
 [ 0  0  0  0  0  0  6  4  2 40 41 42  4 43 44 45 46 47  5  1 48]
 [49 50 51 52 53 54 55 56 57 58  4 59 60 61 62  2 63 64 65 66 67]]
