## TensorFlow Tokenizer:

Let's go ahead and __tokenize__ our sentences using TensorFlow's built-in __APIs__. This gives us a dictionary that maps each unique word to a numerical index, which we can later use to feed our model.

In [8]:
import tensorflow as tf
import warnings 
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

# Silence every Python warning for the rest of the session: no category or
# module filter is given, so this hides all warnings, not only TensorFlow's.
# NOTE(review): this runs after the imports above, so warnings raised while
# importing TensorFlow are not affected by it.
warnings.filterwarnings("ignore")

In [9]:
# Show which TensorFlow version the kernel is running, so the outputs below
# can be tied to a specific release.
print(tf.__version__)

1.14.0


In [10]:
# Toy corpus: two sentences that differ only in their last word.
sentences = ["I love my cat", "I love my dog"]

In [11]:
# num_words caps how many of the most frequent words are used when texts are
# encoded; with this tiny corpus the cap of 100 is never reached.
tokenizer = Tokenizer(num_words=100)

# Build the vocabulary from the corpus.
tokenizer.fit_on_texts(sentences)

# word_index maps each unique (lower-cased) word to an integer index.
word_index = tokenizer.word_index
print(word_index)

{'i': 1, 'love': 2, 'my': 3, 'cat': 4, 'dog': 5}


## Tokenizing Unbalanced Sequences:

In [14]:
# Corpus whose sentences have different lengths (4, 4 and 7 words).
sentences = [
    "I love my dog",
    "I love my cat",
    "Do you think my dog is amazing?",
]

# Fit a fresh tokenizer on the new corpus and keep its word -> index mapping.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode every sentence as a list of word indices (one inner list per sentence).
sequences = tokenizer.texts_to_sequences(sentences)

print(word_index)
print(sequences)

{'my': 1, 'i': 2, 'love': 3, 'dog': 4, 'cat': 5, 'do': 6, 'you': 7, 'think': 8, 'is': 9, 'amazing': 10}
[[2, 3, 1, 4], [2, 3, 1, 5], [6, 7, 8, 1, 4, 9, 10]]


In [17]:
# Sentences the tokenizer was NOT fitted on; "really" does not appear in the
# vocabulary built from the training sentences.
test_sentences = [
    "I really love my dog",
    "My dog is amazing!"
]

# Encode with the already-fitted tokenizer: nothing is re-trained here, the
# existing word indices are simply reused.
test_seq = tokenizer.texts_to_sequences(test_sentences)

print(test_seq) # words missing from the vocabulary (e.g. "really") are silently dropped, so the first sequence has only 4 entries

[[2, 3, 1, 4], [1, 4, 9, 10]]


## Adding an "OOV" Token for Unseen Words:

In [19]:
# Training corpus used to build the vocabulary.
sentences = [
    "I love my dog",
    "I love my cat",
    "Do you think my dog is amazing?",
]

# Held-out sentences; "really" is not part of the training corpus.
test_sentences = [
    "I really love my dog",
    "My dog is amazing!",
]

# oov_token reserves a vocabulary entry that stands in for any word the
# tokenizer has not seen during fitting, instead of dropping that word.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode both the training and the held-out sentences as lists of indices.
sequences = tokenizer.texts_to_sequences(sentences)
test_seq = tokenizer.texts_to_sequences(test_sentences)

# Unseen words now show up as the "<OOV>" index rather than disappearing.
print(test_seq)
print(word_index)

[[3, 1, 4, 2, 5], [2, 5, 10, 11]]
{'<OOV>': 1, 'my': 2, 'i': 3, 'love': 4, 'dog': 5, 'cat': 6, 'do': 7, 'you': 8, 'think': 9, 'is': 10, 'amazing': 11}
