## Tokenizing:

In [1]:
import tensorflow as tf 
from tensorflow import keras 
from tensorflow.keras.preprocessing.text import Tokenizer 
import warnings 
# Suppress every Python warning raised from this point on, so the cell
# outputs below stay uncluttered. The filter is installed after the imports
# above, so it does not affect anything emitted while they were executing.
warnings.filterwarnings(action='ignore')

# Record which TensorFlow release this notebook was run against.
print(tf.__version__)

1.15.0


In [2]:
# Minimal two-sentence corpus; the sentences differ only in their last word.
sentences = ['I love my cat', 'I love my dog']

In [3]:
# Learn a vocabulary from the corpus. num_words=100 is far above the handful
# of distinct words used here, so no word is excluded.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# word_index maps each lower-cased word to its integer id (ids start at 1).
word_index = tokenizer.word_index
print(word_index)

{'i': 1, 'love': 2, 'my': 3, 'cat': 4, 'dog': 5}


In [6]:
# Sentences with different word counts: the encoded sequences produced below
# have different lengths as well (4, 4 and 3 ids).
sentences = [
    'I love my cat',
    'I really like NLP',
    'Today is sundays',
]

# Re-fit a fresh tokenizer on the new corpus.
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

print('\nword_index: ', word_index)

# Replace each word by its id from word_index, one list of ids per sentence.
sequences = tokenizer.texts_to_sequences(sentences)
print('\nSequences: ', sequences)


word_index:  {'i': 1, 'love': 2, 'my': 3, 'cat': 4, 'really': 5, 'like': 6, 'nlp': 7, 'today': 8, 'is': 9, 'sundays': 10}

Sequences:  [[1, 2, 3, 4], [1, 5, 6, 7], [8, 9, 10]]


In [8]:
# Encode sentences containing words the tokenizer was not fitted on.
# The tokenizer from the previous cell has no oov_token, so unknown words are
# silently dropped and only the known ones ('i', 'like', 'is', 'my') remain.
test_sentences = [
    'I like to go for hikes',
    'Mudassir is my nephew',
]

test_seq = tokenizer.texts_to_sequences(test_sentences)

print('\nTest Sequences: ', test_seq)


Test Sequences:  [[1, 6], [9, 3]]


In [12]:
# Fit a tokenizer that reserves an explicit out-of-vocabulary token, so that
# unseen words are encoded as "<OOV>" (id 1) instead of being dropped.
sentences = [
    'my job is to analyze the data',
    'and make predictions out of it',
]

tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Encode the very sentences the tokenizer was fitted on: every word is known,
# so no id 1 appears in the result.
test_sequences = tokenizer.texts_to_sequences(sentences)

print('\nWord Index: ', word_index)
print('\nTest Sequences: ', test_sequences)

# Encode sentences made mostly of unseen words; each unknown word becomes 1.
test_seq2 = tokenizer.texts_to_sequences(
    [
        'this is a test sentence',
        'I like to go for walk',
    ]
)

# The word index is printed again to show that encoding new text does not
# change the fitted vocabulary.
print('\nWord Index: ', word_index)
print('\nTest Sequence 2: ', test_seq2)


Word Index:  {'<OOV>': 1, 'my': 2, 'job': 3, 'is': 4, 'to': 5, 'analyze': 6, 'the': 7, 'data': 8, 'and': 9, 'make': 10, 'predictions': 11, 'out': 12, 'of': 13, 'it': 14}

Test Sequences:  [[2, 3, 4, 5, 6, 7, 8], [9, 10, 11, 12, 13, 14]]

Word Index:  {'<OOV>': 1, 'my': 2, 'job': 3, 'is': 4, 'to': 5, 'analyze': 6, 'the': 7, 'data': 8, 'and': 9, 'make': 10, 'predictions': 11, 'out': 12, 'of': 13, 'it': 14}

Test Sequence 2:  [[1, 4, 1, 1, 1], [1, 1, 5, 1, 1, 1]]


## Padding:

In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [18]:
# Corpus whose sentences have 3, 6 and 7 words respectively.
sentences = [
    'I like python',
    'I am becoming an ML Engineer',
    'I am a junior level data scientist',
]

# Fit a tokenizer with an explicit out-of-vocabulary token.
tokenizer = Tokenizer(num_words=100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)

# Bring every sequence to length 10. Only maxlen is passed, so the remaining
# pad_sequences defaults apply; the printed result shows zeros on the left.
padded = pad_sequences(sequences, maxlen=10)

print('\nWord Indexes :', word_index)
print('\nInteger Sequences: ', sequences)
print('\nPadded Integer Sequences ', padded)

# Visual separator between the fitted-corpus results and the unseen-text results.
print('=' * 20)

# Sentences made almost entirely of words outside the fitted vocabulary.
test_sentences = [
    'this is a test sequence',
    'let us see what happens',
    'when we test this out!',
]

test_sequences = tokenizer.texts_to_sequences(test_sentences)

# NOTE: `padded` is reassigned here and now holds the test result, not the
# padded training sequences computed above.
padded = pad_sequences(test_sequences, maxlen=10)

print('\nTest Sequences: ', test_sequences)
print('\nPadded Sequences: ', padded)

# Show the container type returned by pad_sequences.
print(type(padded))


Word Indexes : {'<OOV>': 1, 'i': 2, 'am': 3, 'like': 4, 'python': 5, 'becoming': 6, 'an': 7, 'ml': 8, 'engineer': 9, 'a': 10, 'junior': 11, 'level': 12, 'data': 13, 'scientist': 14}

Integer Sequences:  [[2, 4, 5], [2, 3, 6, 7, 8, 9], [2, 3, 10, 11, 12, 13, 14]]

Padded Integer Sequences  [[ 0  0  0  0  0  0  0  2  4  5]
 [ 0  0  0  0  2  3  6  7  8  9]
 [ 0  0  0  2  3 10 11 12 13 14]]

Test Sequences:  [[1, 1, 10, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]

Padded Sequences:  [[ 0  0  0  0  0  1  1 10  1  1]
 [ 0  0  0  0  0  1  1  1  1  1]
 [ 0  0  0  0  0  1  1  1  1  1]]
<class 'numpy.ndarray'>
