<a href="https://colab.research.google.com/github/yeesem/Natural-Laguage-Processing/blob/main/Tokenizer_Basic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Define input sentences.
# The stray comma in the second sentence is there to show that punctuation
# does not produce a separate token.
sentences = [
    'I love my dog',
    'I, love my cat'
]

# Initialize the Tokenizer class.
# num_words limits how many of the most frequent words are used when texts
# are later converted to sequences.
tokenizer = Tokenizer(num_words = 100)

# Generate indices for each word in the corpus
tokenizer.fit_on_texts(sentences)

# Get the word -> index dictionary and print it.
# By default, all punctuation is ignored and words are converted to lower case.
word_index = tokenizer.word_index
print(word_index)

{'i': 1, 'love': 2, 'my': 3, 'dog': 4, 'cat': 5}


In [5]:
# A slightly larger corpus: mixed punctuation, plus both 'love' and 'loves'
sentences = [
    'I love my dog',
    'I, love my cat',
    'You love my dog!',
    "He loves my dog"
]

# num_words is deliberately tiny here: the printed dictionary still lists
# every word, so this setting does not change how word_index is built.
tokenizer = Tokenizer(num_words=1)

# Learn the vocabulary from the sentences
tokenizer.fit_on_texts(sentences)

# Fetch the word -> index mapping and display it.
# Note: 'love' and 'loves' are treated as two distinct words and therefore
# receive different indices.
word_index = tokenizer.word_index
print(word_index)

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'cat': 5, 'you': 6, 'he': 7, 'loves': 8}


**Generating Sequences and padding**

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Corpus used to fit the tokenizer; the last sentence is longer than the rest
sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog',
    'Do you think my dog is missing'
]

# Initialize the Tokenizer class with a dedicated token for words that are
# not part of the learned vocabulary
tokenizer = Tokenizer(
    num_words=100,
    oov_token='<OOV>',
)

# Build the vocabulary from the input sentences
tokenizer.fit_on_texts(sentences)

# Word -> index dictionary learned above
word_index = tokenizer.word_index

# Encode every sentence as a list of word indices
sequences = tokenizer.texts_to_sequences(sentences)

# Show the vocabulary and the encoded sentences
print("Word index : ",word_index)
print("Sequences  : ",sequences)

Word index :  {'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'missing': 11}
Sequences  :  [[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]


**Padding**

In [8]:
# Bring every sequence to exactly 5 tokens:
#   - shorter sequences get zeros appended at the end (padding='post')
#   - longer sequences lose their trailing tokens (truncating='post')
padded = pad_sequences(
    sequences,
    maxlen=5,
    padding='post',
    truncating='post',
)

# Display the resulting 2-D array
print("Padded Sequences:")
print(padded)

Padded Sequences:
[[5 3 2 4 0]
 [5 3 2 7 0]
 [6 3 2 4 0]
 [8 6 9 2 4]]


**Out-of-vocabulary tokens**

In [11]:
# Sentences containing words the tokenizer never saw while fitting
test_data = [
    'I really love my dog',
    'My dog loves my manatee'
]

# Encode the new sentences with the already-fitted tokenizer
test_seq = tokenizer.texts_to_sequences(test_data)

# Show the vocabulary again for reference
print("Word index : ",word_index)

# Unseen words ('really', 'loves', 'manatee') are encoded with the
# '<OOV>' index, which is 1 in the dictionary printed above
print("Test Sequence : ",test_seq)

# Append zeros so that each sequence has length 10, then show the result
padded = pad_sequences(test_seq, padding='post', maxlen=10)
print("Padded Test Sequences: ")
print(padded)

Word index :  {'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'missing': 11}
Test Sequence :  [[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]
Padded Test Sequences: 
[[5 1 3 2 4 0 0 0 0 0]
 [2 4 1 2 1 0 0 0 0 0]]
