# Text Preprocessing

In [None]:
import tensorflow as tf

## Adding a dataset

In [None]:
# Toy corpus: six short lines of text used throughout the notebook.
sentences = [
    "Her face a river.",
    "This is a curse,",
    "a blessing too.",
    "And then you leave and I stand outside screaming.",
    "You leave and rain runs through me for days.",
    "I call every storm that hits me by your name.",
]

## Essentials

### Fitting Tokenizer

In [None]:
# Vocabulary cap handed to the tokenizer as `num_words`; it is far larger than
# the number of distinct words in this toy corpus.
MAX_VOCAB_SIZE = 20000

# Step 1: create the tokenizer and let it build its word index from the corpus.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=MAX_VOCAB_SIZE,
)
tokenizer.fit_on_texts(sentences)

# Step 2: encode every sentence as a list of integer word indices.
sequences = tokenizer.texts_to_sequences(sentences)

In [None]:
# Show the encoded corpus: one list of integer word indices per sentence,
# with list lengths that differ from sentence to sentence.
print(sequences)

[[7, 8, 1, 9], [10, 11, 1, 12], [1, 13, 14], [2, 15, 3, 4, 2, 5, 16, 17, 18], [3, 4, 2, 19, 20, 21, 6, 22, 23], [5, 24, 25, 26, 27, 28, 6, 29, 30, 31]]


In [None]:
# Show the word -> integer index mapping learned by the tokenizer.
# Indices start at 1, and words that repeat in the corpus (e.g. 'a', 'and')
# receive the smallest indices.
print(tokenizer.word_index)

{'a': 1, 'and': 2, 'you': 3, 'leave': 4, 'i': 5, 'me': 6, 'her': 7, 'face': 8, 'river': 9, 'this': 10, 'is': 11, 'curse': 12, 'blessing': 13, 'too': 14, 'then': 15, 'stand': 16, 'outside': 17, 'screaming': 18, 'rain': 19, 'runs': 20, 'through': 21, 'for': 22, 'days': 23, 'call': 24, 'every': 25, 'storm': 26, 'that': 27, 'hits': 28, 'by': 29, 'your': 30, 'name': 31}


### Padding the sequences

Pad the sequences so that we get an N × T matrix (N sequences, each of length T), which can be used as the input matrix for most models (e.g., an RNN).

In [None]:
# Default behaviour (no maxlen given): shorter rows are filled with leading
# zeros until they are as long as the longest sequence.
data = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
)
print(data)

[[ 0  0  0  0  0  0  7  8  1  9]
 [ 0  0  0  0  0  0 10 11  1 12]
 [ 0  0  0  0  0  0  0  1 13 14]
 [ 0  2 15  3  4  2  5 16 17 18]
 [ 0  3  4  2 19 20 21  6 22 23]
 [ 5 24 25 26 27 28  6 29 30 31]]


### Custom length padding

In [None]:
# Target number of columns (T) for the padded matrix.
MAX_SEQUENCE_LENGTH = 10

# Pad to an explicit length; 'pre' (the default) puts the zeros at the front.
data = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='pre',
)
print(data)

[[ 0  0  0  0  0  0  7  8  1  9]
 [ 0  0  0  0  0  0 10 11  1 12]
 [ 0  0  0  0  0  0  0  1 13 14]
 [ 0  2 15  3  4  2  5 16 17 18]
 [ 0  3  4  2 19 20 21  6 22 23]
 [ 5 24 25 26 27 28  6 29 30 31]]


## Extras

### Custom length Post padding

In [None]:
# Same target length as before, but the zeros are appended after the tokens
# instead of being placed in front of them.
data = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding='post',
)
print(data)

[[ 7  8  1  9  0  0  0  0  0  0]
 [10 11  1 12  0  0  0  0  0  0]
 [ 1 13 14  0  0  0  0  0  0  0]
 [ 2 15  3  4  2  5 16 17 18  0]
 [ 3  4  2 19 20 21  6 22 23  0]
 [ 5 24 25 26 27 28  6 29 30 31]]


### Too much padding

In [None]:
# maxlen exceeds the longest sequence (10 tokens), so every row, including the
# longest one, receives extra leading zeros.
data = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=12,
)
print(data)

[[ 0  0  0  0  0  0  0  0  7  8  1  9]
 [ 0  0  0  0  0  0  0  0 10 11  1 12]
 [ 0  0  0  0  0  0  0  0  0  1 13 14]
 [ 0  0  0  2 15  3  4  2  5 16 17 18]
 [ 0  0  0  3  4  2 19 20 21  6 22 23]
 [ 0  0  5 24 25 26 27 28  6 29 30 31]]


### Truncation

In [None]:
# maxlen is shorter than the longest sequences, so they are truncated.
# 'pre' (the default) discards tokens from the beginning of a sequence.
data = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=8,
    truncating='pre',
)
print(data)

[[ 0  0  0  0  7  8  1  9]
 [ 0  0  0  0 10 11  1 12]
 [ 0  0  0  0  0  1 13 14]
 [15  3  4  2  5 16 17 18]
 [ 4  2 19 20 21  6 22 23]
 [25 26 27 28  6 29 30 31]]


In [None]:
# 'post' truncation discards tokens from the end of over-long sequences;
# shorter sequences are still padded with zeros at the front.
data = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen=8,
    truncating='post',
)
print(data)

[[ 0  0  0  0  7  8  1  9]
 [ 0  0  0  0 10 11  1 12]
 [ 0  0  0  0  0  1 13 14]
 [ 2 15  3  4  2  5 16 17]
 [ 3  4  2 19 20 21  6 22]
 [ 5 24 25 26 27 28  6 29]]
