In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences




In [2]:
# Define input sentences
sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?'
]

# Initialize the Tokenizer class
# total number of words in word_index
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer 

<keras.src.preprocessing.text.Tokenizer at 0x29c2ebc1f50>

In [3]:
# Generate indices for each word in the corpus
tokenizer.fit_on_texts(sentences)

In [4]:
tokenizer.get_config()

{'num_words': 100,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'lower': True,
 'split': ' ',
 'char_level': False,
 'oov_token': '<OOV>',
 'document_count': 4,
 'word_counts': '{"i": 2, "love": 3, "my": 4, "dog": 3, "cat": 1, "you": 2, "do": 1, "think": 1, "is": 1, "amazing": 1}',
 'word_docs': '{"i": 2, "dog": 3, "my": 4, "love": 3, "cat": 1, "you": 2, "is": 1, "do": 1, "think": 1, "amazing": 1}',
 'index_docs': '{"5": 2, "4": 3, "2": 4, "3": 3, "7": 1, "6": 2, "10": 1, "8": 1, "9": 1, "11": 1}',
 'index_word': '{"1": "<OOV>", "2": "my", "3": "love", "4": "dog", "5": "i", "6": "you", "7": "cat", "8": "do", "9": "think", "10": "is", "11": "amazing"}',
 'word_index': '{"<OOV>": 1, "my": 2, "love": 3, "dog": 4, "i": 5, "you": 6, "cat": 7, "do": 8, "think": 9, "is": 10, "amazing": 11}'}

In [5]:
# Get the indices and print it
word_index = tokenizer.word_index
print(word_index)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}


##### text to sequence

In [6]:
# Generate list of token sequences
new_sentence = ["I love my monkey do you think it is amazing",
                "i have a new cat"]

new_sequence = tokenizer.texts_to_sequences(new_sentence)

# Print the result
print(sentences)
print("\nWord Index = ", word_index)
print("\nSequences = ", new_sequence)

['I love my dog', 'I love my cat', 'You love my dog!', 'Do you think my dog is amazing?']

Word Index =  {'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}

Sequences =  [[5, 3, 2, 1, 8, 6, 9, 1, 10, 11], [5, 1, 1, 1, 7]]


##### Sequence to Text

In [7]:
tokenizer.sequences_to_texts(new_sequence)

['i love my <OOV> do you think <OOV> is amazing', 'i <OOV> <OOV> <OOV> cat']

##### Padding

In [8]:
# Pad the sequences to a uniform length
padded = pad_sequences(new_sequence, maxlen=15,
                       padding='post', truncating='post')
print(padded)

[[ 5  3  2  1  8  6  9  1 10 11  0  0  0  0  0]
 [ 5  1  1  1  7  0  0  0  0  0  0  0  0  0  0]]
