# Word Encodings

The notebooks explains the implementation of word encodings in NLP using the tensorflow library.

## Import libraries and APIs

In [1]:
## import the tensorflow APIs
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Define training sentences

In [2]:
train_sentences = [
             'It will rain',
             'The weather is cloudy!',
             'Will it be raining today?',
             'It is a super hot day!',
]

## Set up the tokenizer

In [3]:
##instantiate the tokenizer
tokenizer = Tokenizer(num_words=100)

##train the tokenizer on training sentences
tokenizer.fit_on_texts(train_sentences)

##store word index for the words in the sentence
word_index = tokenizer.word_index


In [4]:
print(word_index)

{'it': 1, 'will': 2, 'is': 3, 'rain': 4, 'the': 5, 'weather': 6, 'cloudy': 7, 'be': 8, 'raining': 9, 'today': 10, 'a': 11, 'super': 12, 'hot': 13, 'day': 14}


## Create sequences

In [5]:
##create sequences using tokenizer
sequences = tokenizer.texts_to_sequences(train_sentences)

In [6]:
##print word index dictionary and sequences
print(f"Word index -->{word_index}")
print(f"Sequences of words -->{sequences}")

Word index -->{'it': 1, 'will': 2, 'is': 3, 'rain': 4, 'the': 5, 'weather': 6, 'cloudy': 7, 'be': 8, 'raining': 9, 'today': 10, 'a': 11, 'super': 12, 'hot': 13, 'day': 14}
Sequences of words -->[[1, 2, 4], [5, 6, 3, 7], [2, 1, 8, 9, 10], [1, 3, 11, 12, 13, 14]]


In [7]:
##print sample sentence and sequence
print(train_sentences[0])
print(sequences[0])

It will rain
[1, 2, 4]


## Tokenizing new data using the same tokenizer

In [8]:
new_sentences = [
                 'Will it be raining today?',
                 'It is a pleasant day.'
]

new_sequences = tokenizer.texts_to_sequences(new_sentences) # Be carefull, there is missing values because they are first time appear

print(new_sentences)
print(new_sequences)

['Will it be raining today?', 'It is a pleasant day.']
[[2, 1, 8, 9, 10], [1, 3, 11, 14]]


## Replacing newly encountered words with special values | Train the tokenizer

In [9]:
##set up the tokenizer again with oov_token
tokenizer = Tokenizer(num_words=100, oov_token= "<oov>")

##train the new tokenizer on training sentences
tokenizer.fit_on_texts(train_sentences)

##store word index for the words in the sentence
word_index = tokenizer.word_index

In [10]:
##create sequences of the new sentences
new_sequences = tokenizer.texts_to_sequences(new_sentences)
print(word_index)
print(new_sequences)

{'<oov>': 1, 'it': 2, 'will': 3, 'is': 4, 'rain': 5, 'the': 6, 'weather': 7, 'cloudy': 8, 'be': 9, 'raining': 10, 'today': 11, 'a': 12, 'super': 13, 'hot': 14, 'day': 15}
[[3, 2, 9, 10, 11], [2, 4, 12, 1, 15]]


## Pad Sequences

In [11]:
##pad sequences
padded_seqs = pad_sequences(sequences)

In [12]:
print(word_index)
print(train_sentences)
print(sequences)
print(padded_seqs)

{'<oov>': 1, 'it': 2, 'will': 3, 'is': 4, 'rain': 5, 'the': 6, 'weather': 7, 'cloudy': 8, 'be': 9, 'raining': 10, 'today': 11, 'a': 12, 'super': 13, 'hot': 14, 'day': 15}
['It will rain', 'The weather is cloudy!', 'Will it be raining today?', 'It is a super hot day!']
[[1, 2, 4], [5, 6, 3, 7], [2, 1, 8, 9, 10], [1, 3, 11, 12, 13, 14]]
[[ 0  0  0  1  2  4]
 [ 0  0  5  6  3  7]
 [ 0  2  1  8  9 10]
 [ 1  3 11 12 13 14]]


## Customising your padded sequence with parameters

In [13]:
##pad sequences with padding type, max length and truncating parameters
padded_seqs = pad_sequences(sequences,
                            padding="post",
                            maxlen=5,
                            truncating="post",
                            )

In [14]:
print(padded_seqs)

[[ 1  2  4  0  0]
 [ 5  6  3  7  0]
 [ 2  1  8  9 10]
 [ 1  3 11 12 13]]
