<a href="https://colab.research.google.com/github/tytyty93/LazyProgrammer/blob/master/LP_Text_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Creating Dummy Sentences

In [3]:
# Three short example sentences that serve as the toy corpus for this notebook.
sentences = [
    'I like eggs and ham.',
    'I love chocolate and bunnies.',
    'I hate onions.',
]

# Defining the Max Vocab Size and Tokenizing the Sentences

In [23]:
# Value passed to the Tokenizer as its num_words limit.
# NOTE: per the Keras docs, only the num_words - 1 most frequent words are kept
# when encoding; this corpus has just 10 distinct words, so nothing is dropped.
MAX_VOCAB_SIZE = 2000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
# Build the word -> index vocabulary from the example sentences.
tokenizer.fit_on_texts(sentences)
# Encode each sentence as a list of word indices (one inner list per sentence).
sequences = tokenizer.texts_to_sequences(sentences)

print(sequences)

[[1, 3, 4, 2, 5], [1, 6, 7, 2, 8], [1, 9, 10]]


# Displaying the Word-to-Index Mapping

In [21]:
# Keys are the lower-cased words with punctuation stripped. Indices start at 1,
# and the most frequent words ('i', then 'and') receive the smallest indices.
tokenizer.word_index

{'and': 2,
 'bunnies': 8,
 'chocolate': 7,
 'eggs': 4,
 'ham': 5,
 'hate': 9,
 'i': 1,
 'like': 3,
 'love': 6,
 'onions': 10}

# Padding the Sequences

In [22]:
# Using default values: each row is padded with leading zeros up to the length
# of the longest sequence (5 here), so only the 3-token row changes.
data = pad_sequences(sequences)
print(data)

[[ 1  3  4  2  5]
 [ 1  6  7  2  8]
 [ 0  0  1  9 10]]


In [24]:
# Fix the target length explicitly. The longest sequence already has 5 tokens,
# so the result is identical to the default call above.
MAX_SEQUENCE_LENGTH = 5
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print(data)

[[ 1  3  4  2  5]
 [ 1  6  7  2  8]
 [ 0  0  1  9 10]]


In [25]:
# Post-padding: the zeros are appended after the tokens, so the last row
# ends with two 0s instead of starting with them.
data = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
)
print(data)

[[ 1  3  4  2  5]
 [ 1  6  7  2  8]
 [ 1  9 10  0  0]]


In [26]:
# Too much padding: maxlen (6) exceeds the longest sequence (5), so every row
# gains leading zeros -- one for the 5-token rows, three for the 3-token row.
data = pad_sequences(sequences, maxlen=6)
print(data)

[[ 0  1  3  4  2  5]
 [ 0  1  6  7  2  8]
 [ 0  0  0  1  9 10]]


In [27]:
# Truncation: with maxlen=4 the 5-token rows lose their first integer
# (by default truncation happens at the start), while the 3-token row
# is not truncated and is still padded with one leading zero.
data = pad_sequences(sequences, maxlen=4)
print(data)

[[ 3  4  2  5]
 [ 6  7  2  8]
 [ 0  1  9 10]]


In [28]:
# Setting truncating='post': the 5-token rows now lose their last integer
# instead of their first. Padding is unaffected, so the 3-token row still
# gets its zero at the start.
data = pad_sequences(sequences, maxlen=4, truncating='post')
print(data)

[[ 1  3  4  2]
 [ 1  6  7  2]
 [ 0  1  9 10]]
