In [None]:
#Intro to NLP and Text Processing with TensorFlow

In [None]:
#Text Encoding with Keras Tokenizer

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
# A sample sentence

sentences = ['TensorFlow is a Machine Learning framework',
             'Keras is a well designed deep learning API',
             'TensorFlow and Keras make a great machine learning ecosystem'
              
]

In [3]:
sentences

['TensorFlow is a Machine Learning framework',
 'Keras is a well designed deep learning API',
 'TensorFlow and Keras make a great machine learning ecosystem']

In [4]:
tokenizer = Tokenizer(num_words=1000, char_level=True)

# Fitting tokenizer on sentences
tokenizer.fit_on_texts(sentences)

In [5]:
char_index = tokenizer.word_index
print(char_index)

{' ': 1, 'e': 2, 'a': 3, 'n': 4, 'r': 5, 's': 6, 'i': 7, 'l': 8, 'o': 9, 'm': 10, 'g': 11, 't': 12, 'w': 13, 'k': 14, 'd': 15, 'f': 16, 'c': 17, 'h': 18, 'p': 19, 'y': 20}


In [6]:
tokenizer = Tokenizer(num_words=1000)

# Fitting tokenizer on sentences
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
print(word_index)

{'a': 1, 'learning': 2, 'tensorflow': 3, 'is': 4, 'machine': 5, 'keras': 6, 'framework': 7, 'well': 8, 'designed': 9, 'deep': 10, 'api': 11, 'and': 12, 'make': 13, 'great': 14, 'ecosystem': 15}


In [7]:
word_counts = tokenizer.word_counts
word_counts

OrderedDict([('tensorflow', 2),
             ('is', 2),
             ('a', 3),
             ('machine', 2),
             ('learning', 3),
             ('framework', 1),
             ('keras', 2),
             ('well', 1),
             ('designed', 1),
             ('deep', 1),
             ('api', 1),
             ('and', 1),
             ('make', 1),
             ('great', 1),
             ('ecosystem', 1)])

In [9]:
 #Converting the Texts into Sequence of Tokens

In [10]:
sentences

['TensorFlow is a Machine Learning framework',
 'Keras is a well designed deep learning API',
 'TensorFlow and Keras make a great machine learning ecosystem']

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Redefining our sentence

sentences = ['TensorFlow is a Machine Learning framework',
             'Keras is a well designed deep learning API',
             'TensorFlow and Keras make a great machine learning ecosystem!',
             'TensorFlow is built on top of Keras',
             'TensorFlow revolves around tensors!'
              
]

tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Converting text to sequences

text_sequences = tokenizer.texts_to_sequences(sentences)

In [12]:
print(f'Words with tokens: {word_index}')
print(f'Sequence of tokens: {text_sequences}')

Words with tokens: {'tensorflow': 1, 'is': 2, 'a': 3, 'learning': 4, 'keras': 5, 'machine': 6, 'framework': 7, 'well': 8, 'designed': 9, 'deep': 10, 'api': 11, 'and': 12, 'make': 13, 'great': 14, 'ecosystem': 15, 'built': 16, 'on': 17, 'top': 18, 'of': 19, 'revolves': 20, 'around': 21, 'tensors': 22}
Sequence of tokens: [[1, 2, 3, 6, 4, 7], [5, 2, 3, 8, 9, 10, 4, 11], [1, 12, 5, 13, 3, 14, 6, 4, 15], [1, 2, 16, 17, 18, 19, 5], [1, 20, 21, 22]]


In [13]:
tokenizer = Tokenizer(num_words=1000, oov_token='Word Out of Vocab')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

# Converting text to sequences

text_sequences = tokenizer.texts_to_sequences(sentences)

print(f'Words with tokens: {word_index}')
print(f'Sequence of tokens: {text_sequences}')

Words with tokens: {'Word Out of Vocab': 1, 'tensorflow': 2, 'is': 3, 'a': 4, 'learning': 5, 'keras': 6, 'machine': 7, 'framework': 8, 'well': 9, 'designed': 10, 'deep': 11, 'api': 12, 'and': 13, 'make': 14, 'great': 15, 'ecosystem': 16, 'built': 17, 'on': 18, 'top': 19, 'of': 20, 'revolves': 21, 'around': 22, 'tensors': 23}
Sequence of tokens: [[2, 3, 4, 7, 5, 8], [6, 3, 4, 9, 10, 11, 5, 12], [2, 13, 6, 14, 4, 15, 7, 5, 16], [2, 3, 17, 18, 19, 20, 6], [2, 21, 22, 23]]


In [14]:
new_sentences = ['I like TensorFlow', # like is a new word
                'Keras is a superb deep learning API' # superb is a new word
                
] 

sequences_on_newtexts = tokenizer.texts_to_sequences(new_sentences)
print(f'Sequence of tokens: {sequences_on_newtexts}')

Sequence of tokens: [[1, 1, 2], [6, 3, 4, 1, 11, 5, 12]]


In [15]:
 #Padding the Sequences to Have the Same Length

In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Padding text_sequences 

padded_sequences = pad_sequences(text_sequences, maxlen=10)


print('WORD INDEX')
print(f'Words with tokens: {word_index}')
print("---------")


print('SEQUENCES')
print(f'Words with tokens: {text_sequences}')
print("---------")

print('PADDED SEQUENCES')
print(f'Sequence of tokens: {padded_sequences}')

WORD INDEX
Words with tokens: {'Word Out of Vocab': 1, 'tensorflow': 2, 'is': 3, 'a': 4, 'learning': 5, 'keras': 6, 'machine': 7, 'framework': 8, 'well': 9, 'designed': 10, 'deep': 11, 'api': 12, 'and': 13, 'make': 14, 'great': 15, 'ecosystem': 16, 'built': 17, 'on': 18, 'top': 19, 'of': 20, 'revolves': 21, 'around': 22, 'tensors': 23}
---------
SEQUENCES
Words with tokens: [[2, 3, 4, 7, 5, 8], [6, 3, 4, 9, 10, 11, 5, 12], [2, 13, 6, 14, 4, 15, 7, 5, 16], [2, 3, 17, 18, 19, 20, 6], [2, 21, 22, 23]]
---------
PADDED SEQUENCES
Sequence of tokens: [[ 0  0  0  0  2  3  4  7  5  8]
 [ 0  0  6  3  4  9 10 11  5 12]
 [ 0  2 13  6 14  4 15  7  5 16]
 [ 0  0  0  2  3 17 18 19 20  6]
 [ 0  0  0  0  0  0  2 21 22 23]]


In [17]:
padded_sequences = pad_sequences(text_sequences, maxlen=10, padding='post', value=-1)

print('PADDED SEQUENCES')
print(f'Sequence of tokens: {padded_sequences}')

PADDED SEQUENCES
Sequence of tokens: [[ 2  3  4  7  5  8 -1 -1 -1 -1]
 [ 6  3  4  9 10 11  5 12 -1 -1]
 [ 2 13  6 14  4 15  7  5 16 -1]
 [ 2  3 17 18 19 20  6 -1 -1 -1]
 [ 2 21 22 23 -1 -1 -1 -1 -1 -1]]


In [18]:
#Using TextVectorization Layer to Preprocess Texts

In [30]:
import tensorflow as tf

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [31]:
# Sample sentences

sentences = [
             'TensorFlow is a deep learning library!',
             'Is TensorFlow powered by Keras API?'
]

In [33]:
max_features = 1000

text_vect_layer = tf.keras.layers.TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=10

)

AttributeError: module 'tensorflow.keras.layers' has no attribute 'TextVectorization'

In [26]:
 pip install tensorflow --upgrade

Collecting tensorflow
  Downloading tensorflow-2.8.0-cp38-cp38-manylinux2010_x86_64.whl (497.6 MB)
[K     |████████████████████████████████| 497.6 MB 119 bytes/s  0:00:01    |██                              | 32.4 MB 2.4 MB/s eta 0:03:18     |█████▌                          | 86.2 MB 1.1 MB/s eta 0:06:11     |██████▏                         | 95.8 MB 217 kB/s eta 0:30:45     |███████▉                        | 121.0 MB 1.0 MB/s eta 0:06:08     |████████▏                       | 127.2 MB 892 kB/s eta 0:06:56     |████████▊                       | 135.0 MB 746 kB/s eta 0:08:06     |██████████▎                     | 159.8 MB 1.4 MB/s eta 0:04:08     |██████████▍                     | 160.8 MB 1.4 MB/s eta 0:04:08     |████████████████▏               | 251.5 MB 858 kB/s eta 0:04:47     |████████████████████▏           | 313.3 MB 1.2 MB/s eta 0:02:33     |████████████████████▌           | 319.3 MB 1.3 MB/s eta 0:02:22     |█████████████████████▍          | 332.9 MB 1.4 MB/s eta 0:01:59     

Installing collected packages: tensorboard-data-server, numpy, tf-estimator-nightly, tensorflow-io-gcs-filesystem, tensorboard, libclang, keras, tensorflow
  Attempting uninstall: numpy
    Found existing installation: numpy 1.19.2
    Uninstalling numpy-1.19.2:
      Successfully uninstalled numpy-1.19.2
  Attempting uninstall: tensorboard
    Found existing installation: tensorboard 2.4.1
    Uninstalling tensorboard-2.4.1:
      Successfully uninstalled tensorboard-2.4.1
  Attempting uninstall: keras
    Found existing installation: Keras 2.4.3
    Uninstalling Keras-2.4.3:
      Successfully uninstalled Keras-2.4.3
  Attempting uninstall: tensorflow
    Found existing installation: tensorflow 2.4.1
    Uninstalling tensorflow-2.4.1:
      Successfully uninstalled tensorflow-2.4.1
Successfully installed keras-2.8.0 libclang-13.0.0 numpy-1.22.1 tensorboard-2.8.0 tensorboard-data-server-0.6.1 tensorflow-2.8.0 tensorflow-io-gcs-filesystem-0.23.1 tf-estimator-nightly-2.8.0.dev2021122109

In [34]:
text_vect_layer.adapt(sentences)

NameError: name 'text_vect_layer' is not defined

In [35]:
sample_sentence = 'Tensorflow is a machine learning framework!'

vectorized_sentence = text_vect_layer([sample_sentence])

print(f'Orginal sentence: \n {sample_sentence}')
print(f'Vectorized sentence: \n {vectorized_sentence}')

NameError: name 'text_vect_layer' is not defined