# Text Preprocessing with Keras

In [None]:
# importing libraries

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D
import tsensor
import numpy as np

## Tokenization

In [None]:
# Fit a word-level tokenizer on a single example sentence
sentences = ['The quick brown fox jumps over the lazy dog.']

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

In [None]:
# Word -> integer index mapping built by fit_on_texts above
tokenizer.word_index

In [None]:
# Encode the sentences as integer token ids and wrap them in a NumPy array
train_sequence = np.array(tokenizer.texts_to_sequences(sentences))
print(train_sequence)

![](images/Emb4.png)

## Creating Embedding Layer

In [None]:
# Create a randomly initialised embedding layer.
# input_dim is the vocabulary size (largest token index + 1; index 0 is never
# assigned to a word by the Tokenizer and is used for padding), not the
# sequence length. Deriving it from the tokenizer also keeps this cell runnable
# before MAX_SEQUENCE_LENGTH is defined further down.

embedding = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128)

In [None]:
# Get the embeddings of the train sample: calling the layer on the integer
# token ids looks up one vector per token.

train_sample = embedding(train_sequence)

In [None]:
# Shape of the integer-encoded input: (number of sentences, tokens per sentence)
train_sequence.shape

In [None]:
# Shape after embedding: expected to be the input shape plus a trailing
# axis of size output_dim
train_sample.shape

In [None]:
# Repeat the embedding lookup inside tsensor.explain, which is used here to
# visualise the tensor dimensions of the statement
with tsensor.explain(fontname='Hack', dimfontname='Hack'):
    train_sample = embedding(train_sequence)

In [None]:
# Embedding vectors for the first (and only) sentence in the batch
train_sample[0]

## Averaging across tokens

In [None]:

# Average the token embeddings of each sentence into a single vector
GlobalAveragePooling1D()(train_sample)

![](images/Emb6.png)

In [None]:
# Same averaging as above, run inside tsensor.explain to visualise the
# tensor dimensions; the pooled result is kept in z
with tsensor.explain(fontname='Hack', dimfontname='Hack'):
    z = GlobalAveragePooling1D()(train_sample)

## Creating Word Embeddings for more than one sentence

In [None]:
# A small corpus of several sentences, encoded with the tokenizer fitted earlier.
# NOTE(review): the tokenizer was created without an oov_token, so words it has
# not seen (e.g. in 'Hello, world!') are presumably dropped from the encoding.

test_corpus = [
    'The quick brown fox jumps over the lazy dog.',
    'The quick brown fox.',
    'The lazy dog.',
    'The dog.',
    'Dog and the fox.',
    'Hello, world!',
]
encoded_sentences = tokenizer.texts_to_sequences(test_corpus)

# Show each sentence next to its integer encoding
for text, token_ids in zip(test_corpus, encoded_sentences):
    print(f"{text} {token_ids}")

## Padding Sequences

In [None]:
# Number of tokens in each encoded sentence of the corpus

list(map(len, encoded_sentences))

In [None]:
# Token count of the longest encoded sentence

max(map(len, encoded_sentences))

In [None]:
# Derive the padding length from the data instead of hard-coding it, so it
# stays correct if test_corpus changes (the original hard-coded value was 9).
MAX_SEQUENCE_LENGTH = max(len(sentence) for sentence in encoded_sentences)

In [None]:
# Pad sequences that are shorter than the longest sequence so that every row
# has length MAX_SEQUENCE_LENGTH. With the default arguments, pad_sequences
# fills with zeros at the start of each sequence.

X = pad_sequences(encoded_sentences, maxlen=MAX_SEQUENCE_LENGTH)
X

## Embedding Layer

In [None]:
# Training data with more than one sentence (the padded integer matrix)

X

In [None]:
# Shape of the padded batch: (number of sentences, MAX_SEQUENCE_LENGTH)
X.shape

In [None]:
# Embeddings of the larger corpus. The same embedding layer is reused, so a
# given token id maps to the same vector as in the single-sentence example.

X_embedded = embedding(X)

In [None]:
# Shape after embedding: expected to be X.shape plus a trailing axis of size
# output_dim
X_embedded.shape

In [None]:
# Inspect the embedded batch
X_embedded

In [None]:
# Repeat the embedding lookup for the padded batch inside tsensor.explain to
# visualise the tensor dimensions; the result is kept in x_em
with tsensor.explain(fontname='Hack', dimfontname='Hack'):
    x_em = embedding(X)

In [None]:
# Input shape, shown again for comparison with the embedded shape below
X.shape

In [None]:
# Shape of the embedded batch
x_em.shape

## Averaging across tokens

![](images/Emb6.png)

![](images/Emb7.png)

![](images/Emb8.png)

In [None]:
# Average the token embeddings of every sentence into one vector per sentence.
# NOTE(review): the embedding layer is created without mask_zero, so padded
# positions presumably contribute to the average as well - confirm if intended.
with tsensor.explain(fontname='Hack', dimfontname='Hack'):
    z = GlobalAveragePooling1D()(x_em)

In [None]:
# Shape after pooling: expected to be (number of sentences, output_dim)
z.shape