- Tokenization
- Sequencing 
- Padding
- Stemming
- Lemmatization


### Tokenization

In [2]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [3]:
# Fit a tokenizer on a one-sentence corpus and inspect the word -> index map.
# Indices are assigned by frequency: "learning" occurs twice, so it gets 1.
sentence = ["We love machine learning and deep learning"]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=sentence)
tokenizer.word_index

{'learning': 1, 'we': 2, 'love': 3, 'machine': 4, 'and': 5, 'deep': 6}

In [4]:
# Two things to notice in the resulting word index:
#   * case is ignored  -> "DeeP", "DEEP" and "deep" collapse into one token
#   * special characters such as @ $ . , are stripped by the Keras Tokenizer
sentence = [
    "We love @ machine learning $ and DeeP....LearNing ,.. MACHINE and DEEP learning"
]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=sentence)
tokenizer.word_index

{'learning': 1, 'machine': 2, 'and': 3, 'deep': 4, 'we': 5, 'love': 6}

In [6]:
# A corpus with several sentences: words shared across sentences ("we",
# "sequencing") are counted over the whole corpus and receive the lowest indices.
sentences = [
    "We are learning tokenization",
    "next we will learn sequencing",
    "then Sequencing , Stemming and Lemmetization",
]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=sentences)
tokenizer.word_index

{'we': 1,
 'sequencing': 2,
 'are': 3,
 'learning': 4,
 'tokenization': 5,
 'next': 6,
 'will': 7,
 'learn': 8,
 'then': 9,
 'stemming': 10,
 'and': 11,
 'lemmetization': 12}

In [7]:
# Digits are not part of the default filter set, so "34" and "5" end up in the
# vocabulary as ordinary tokens.
sentences = [
    "We are learning tokenization 34",
    "next we will learn sequencing 5",
    "then Sequencing , Stemming and Lemmetization 5",
]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=sentences)
tokenizer.word_index

{'we': 1,
 'sequencing': 2,
 '5': 3,
 'are': 4,
 'learning': 5,
 'tokenization': 6,
 '34': 7,
 'next': 8,
 'will': 9,
 'learn': 10,
 'then': 11,
 'stemming': 12,
 'and': 13,
 'lemmetization': 14}

In [8]:
# Same corpus as before, but this time the digits are filtered out by
# extending the `filters` string, so "34" and "5" disappear from the vocabulary.
sentences = [
    "We are learning tokenization 34",
    "next we will learn sequencing 5",
    "then Sequencing , Stemming and Lemmetization 5",
]

tokenizer = Tokenizer(
    filters=(
        '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'  # punctuation, tab and newline
        '0123456789'                             # plus every digit
    )
)
tokenizer.fit_on_texts(texts=sentences)
tokenizer.word_index

{'we': 1,
 'sequencing': 2,
 'are': 3,
 'learning': 4,
 'tokenization': 5,
 'next': 6,
 'will': 7,
 'learn': 8,
 'then': 9,
 'stemming': 10,
 'and': 11,
 'lemmetization': 12}

In [9]:
# The default `filters` string does not contain the apostrophe, so a lone "'"
# survives tokenization as its own vocabulary entry (just like the digits
# "34" and "5"). The apostrophe at the end of the second sentence is what
# produces the "'" key in the word index shown below.
sentences=["We are learning tokenization 34",
           "next we will learn sequencing '",
           "then Sequencing , Stemming and Lemmetization 5"]

tokenizer=Tokenizer()
tokenizer.fit_on_texts(sentences)
tokenizer.word_index

{'we': 1,
 'sequencing': 2,
 'are': 3,
 'learning': 4,
 'tokenization': 5,
 '34': 6,
 'next': 7,
 'will': 8,
 'learn': 9,
 "'": 10,
 'then': 11,
 'stemming': 12,
 'and': 13,
 'lemmetization': 14,
 '5': 15}

### Sequencing

In [10]:
# Corpus used for the sequencing examples; the tokenizer fitted here (without
# an OOV token) is reused by the next cells.
sentences = [
    "we are learning text preprocessing",
    "Tokenization refers to representing each word with a numerical token",
    "Sequencing refers to representing sentences as a sequence of tokens",
    "padding refers to adding zeros to make all sequences of same length",
]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=sentences)
tokenizer.word_index


{'to': 1,
 'refers': 2,
 'representing': 3,
 'a': 4,
 'of': 5,
 'we': 6,
 'are': 7,
 'learning': 8,
 'text': 9,
 'preprocessing': 10,
 'tokenization': 11,
 'each': 12,
 'word': 13,
 'with': 14,
 'numerical': 15,
 'token': 16,
 'sequencing': 17,
 'sentences': 18,
 'as': 19,
 'sequence': 20,
 'tokens': 21,
 'padding': 22,
 'adding': 23,
 'zeros': 24,
 'make': 25,
 'all': 26,
 'sequences': 27,
 'same': 28,
 'length': 29}

In [11]:
# Replace every word by its index from `tokenizer.word_index`, producing one
# list of integers per input sentence.
sequences = tokenizer.texts_to_sequences(texts=sentences)
sequences

[[6, 7, 8, 9, 10],
 [11, 2, 1, 3, 12, 13, 14, 4, 15, 16],
 [17, 2, 1, 3, 18, 19, 4, 20, 5, 21],
 [22, 2, 1, 23, 24, 1, 25, 26, 27, 5, 28, 29]]

In [13]:
# "involves", "and" and "more" were never seen while fitting. Because this
# tokenizer has no OOV token, they are silently dropped from the sequence.
new_text = ['Text preprocessing involves tokenization ,sequencing, padding and more']
tokenizer.texts_to_sequences(new_text)

[[9, 10, 11, 17, 22]]

In [14]:
# Adding "does not" flips the meaning of the sentence, yet the encoded result
# is identical to the previous cell: every unknown word simply vanishes.
new_text = ['Text preprocessing does not involves tokenization ,sequencing, padding and more']
tokenizer.texts_to_sequences(new_text)

[[9, 10, 11, 17, 22]]

#### OOV (Out-of-Vocabulary) Token

- Used to represent words that are not in the vocabulary

In [15]:
# Same corpus, but the tokenizer now reserves a placeholder for unknown words.
# The OOV token takes index 1, so every real word shifts up by one.
sentences = [
    "we are learning text preprocessing",
    "Tokenization refers to representing each word with a numerical token",
    "Sequencing refers to representing sentences as a sequence of tokens",
    "padding refers to adding zeros to make all sequences of same length",
]

tokenizer = Tokenizer(oov_token="#OOV")
tokenizer.fit_on_texts(texts=sentences)
tokenizer.word_index


{'#OOV': 1,
 'to': 2,
 'refers': 3,
 'representing': 4,
 'a': 5,
 'of': 6,
 'we': 7,
 'are': 8,
 'learning': 9,
 'text': 10,
 'preprocessing': 11,
 'tokenization': 12,
 'each': 13,
 'word': 14,
 'with': 15,
 'numerical': 16,
 'token': 17,
 'sequencing': 18,
 'sentences': 19,
 'as': 20,
 'sequence': 21,
 'tokens': 22,
 'padding': 23,
 'adding': 24,
 'zeros': 25,
 'make': 26,
 'all': 27,
 'sequences': 28,
 'same': 29,
 'length': 30}

In [16]:
# With an OOV token configured, unknown words ("involves", "and", "more") are
# encoded as index 1 instead of being dropped, so the length is preserved.
new_text = ['Text preprocessing involves tokenization ,sequencing, padding and more']
tokenizer.texts_to_sequences(new_text)

[[10, 11, 1, 12, 18, 23, 1, 1]]

In [17]:
# The extra "does not" now shows up as two additional OOV markers (index 1),
# so this sentence no longer encodes to the same sequence as the previous one.
new_text = ['Text preprocessing does not involves tokenization ,sequencing, padding and more']
tokenizer.texts_to_sequences(new_text)

[[10, 11, 1, 1, 1, 12, 18, 23, 1, 1]]

### Padding

In [18]:
# Sentences of different lengths (4 to 9 words) to motivate padding.
# Index 1 is taken by the OOV token, so real words start at 2.
sentences = [
    "We love machine learning",
    "We are learning tokenization",
    "we are learning sequencing",
    "We are learning the cocept of padding",
    "Machine learning and deep learning are fun",
    "We are fortunate to learn from the best trainer",
]

tokenizer = Tokenizer(oov_token="#OOV")
tokenizer.fit_on_texts(texts=sentences)

sequences = tokenizer.texts_to_sequences(texts=sentences)
sequences


[[3, 7, 5, 2],
 [3, 4, 2, 8],
 [3, 4, 2, 9],
 [3, 4, 2, 6, 10, 11, 12],
 [5, 2, 13, 14, 2, 4, 15],
 [3, 4, 16, 17, 18, 19, 6, 20, 21]]

In [19]:
# `pad_sequences` is imported in the notebook's import cell at the top.
# Called with no extra arguments it left-pads ("pre") every sequence with
# zeros up to the length of the longest sequence (9 tokens here).
padded_seq = pad_sequences(sequences)
padded_seq

array([[ 0,  0,  0,  0,  0,  3,  7,  5,  2],
       [ 0,  0,  0,  0,  0,  3,  4,  2,  8],
       [ 0,  0,  0,  0,  0,  3,  4,  2,  9],
       [ 0,  0,  3,  4,  2,  6, 10, 11, 12],
       [ 0,  0,  5,  2, 13, 14,  2,  4, 15],
       [ 3,  4, 16, 17, 18, 19,  6, 20, 21]])

In [20]:
# padding="post" appends the zeros after the tokens; use "pre" to put them in
# front of the tokens instead.
padded_seq = pad_sequences(sequences, padding="post")
padded_seq

array([[ 3,  7,  5,  2,  0,  0,  0,  0,  0],
       [ 3,  4,  2,  8,  0,  0,  0,  0,  0],
       [ 3,  4,  2,  9,  0,  0,  0,  0,  0],
       [ 3,  4,  2,  6, 10, 11, 12,  0,  0],
       [ 5,  2, 13, 14,  2,  4, 15,  0,  0],
       [ 3,  4, 16, 17, 18, 19,  6, 20, 21]])

In [21]:
# Add one much longer sentence (14 words): without `maxlen`, every sequence is
# padded up to that length, so the short sentences become mostly zeros.
sentences = [
    "We love machine learning",
    "We are learning tokenization",
    "we are learning sequencing",
    "We are learning the cocept of padding",
    "Machine learning and deep learning are fun",
    "We are fortunate to learn from the best trainer",
    "The main goal behing text preprocessing is to represent text in a numerical format",
]

tokenizer = Tokenizer(oov_token="#OOV")
tokenizer.fit_on_texts(texts=sentences)

sequences = tokenizer.texts_to_sequences(texts=sentences)
padded_seq = pad_sequences(sequences, padding="pre")
padded_seq


array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  9,  6,  2],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  4,  2, 10],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  4,  2, 11],
       [ 0,  0,  0,  0,  0,  0,  0,  3,  4,  2,  5, 12, 13, 14],
       [ 0,  0,  0,  0,  0,  0,  0,  6,  2, 15, 16,  2,  4, 17],
       [ 0,  0,  0,  0,  0,  3,  4, 18,  7, 19, 20,  5, 21, 22],
       [ 5, 23, 24, 25,  8, 26, 27,  7, 28,  8, 29, 30, 31, 32]])

In [23]:
# Cap every sequence at 8 tokens. Truncation defaults to "pre" (spelled out
# here), so sequences longer than 8 lose tokens from the *front*.
tokenizer = Tokenizer(oov_token="#OOV")
tokenizer.fit_on_texts(texts=sentences)

sequences = tokenizer.texts_to_sequences(texts=sentences)
padded_seq = pad_sequences(
    sequences,
    maxlen=8,
    padding="pre",
    truncating="pre",
)
padded_seq


array([[ 0,  0,  0,  0,  3,  9,  6,  2],
       [ 0,  0,  0,  0,  3,  4,  2, 10],
       [ 0,  0,  0,  0,  3,  4,  2, 11],
       [ 0,  3,  4,  2,  5, 12, 13, 14],
       [ 0,  6,  2, 15, 16,  2,  4, 17],
       [ 4, 18,  7, 19, 20,  5, 21, 22],
       [27,  7, 28,  8, 29, 30, 31, 32]])

In [24]:
# Same cap of 8 tokens, but truncating="post" cuts the *end* of long sequences
# while the zeros are still added in front of the short ones.
tokenizer = Tokenizer(oov_token="#OOV")
tokenizer.fit_on_texts(texts=sentences)

sequences = tokenizer.texts_to_sequences(texts=sentences)
padded_seq = pad_sequences(
    sequences,
    maxlen=8,
    padding="pre",
    truncating="post",
)
padded_seq


array([[ 0,  0,  0,  0,  3,  9,  6,  2],
       [ 0,  0,  0,  0,  3,  4,  2, 10],
       [ 0,  0,  0,  0,  3,  4,  2, 11],
       [ 0,  3,  4,  2,  5, 12, 13, 14],
       [ 0,  6,  2, 15, 16,  2,  4, 17],
       [ 3,  4, 18,  7, 19, 20,  5, 21],
       [ 5, 23, 24, 25,  8, 26, 27,  7]])

In [25]:
# Pad and truncate on the same side: zeros are appended to short sequences and
# long sequences keep only their first 8 tokens.
tokenizer = Tokenizer(oov_token="#OOV")
tokenizer.fit_on_texts(texts=sentences)

sequences = tokenizer.texts_to_sequences(texts=sentences)
padded_seq = pad_sequences(
    sequences,
    maxlen=8,
    padding="post",
    truncating="post",
)
padded_seq


array([[ 3,  9,  6,  2,  0,  0,  0,  0],
       [ 3,  4,  2, 10,  0,  0,  0,  0],
       [ 3,  4,  2, 11,  0,  0,  0,  0],
       [ 3,  4,  2,  5, 12, 13, 14,  0],
       [ 6,  2, 15, 16,  2,  4, 17,  0],
       [ 3,  4, 18,  7, 19, 20,  5, 21],
       [ 5, 23, 24, 25,  8, 26, 27,  7]])