# How to Use the TextVectorization Layer in TensorFlow

In [1]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

In [2]:
# Instantiate the layer with its default settings.
text_vectorization = TextVectorization() # By default, TextVectorization lowercases the text, strips punctuation, and splits the text into words on whitespace.

In [3]:
# Small Turkish corpus used to build the vocabulary.
# The second sentence must contain "Ece": the recorded vocabulary output and
# the later test sentence "bugün ece çok güzel" both rely on that token.
data = [
    "Bugün hava çok güzel", # eng: Today the weather is very nice
    "Ali, Efe ve Ece çay içecek", # eng: Ali, Efe and Ece will drink tea
    "Selam söyle" # eng: Say hello
]

In [4]:
# Build the vocabulary from the sample sentences with the adapt method.
text_vectorization.adapt(data)

In [5]:
# Let's take a look at the vocabulary.
text_vectorization.get_vocabulary() # The first two entries are reserved: index 0 ('') is the padding token and index 1 ('[UNK]') is the out-of-vocabulary token, used for any word that is not in the vocabulary. Padding makes every output sequence the same length.

['',
 '[UNK]',
 'çok',
 'çay',
 've',
 'söyle',
 'selam',
 'içecek',
 'hava',
 'güzel',
 'efe',
 'ece',
 'bugün',
 'ali']

In [6]:
# Preprocess the data with the layer: each sentence becomes a row of vocabulary indices.
vectorized_text = text_vectorization(data)  
vectorized_text # Shorter sentences are right-padded with 0 (the padding index) so that every row has the same length as the longest sentence.

<tf.Tensor: shape=(3, 6), dtype=int64, numpy=
array([[12,  8,  2,  9,  0,  0],
       [13, 10,  4, 11,  3,  7],
       [ 6,  5,  0,  0,  0,  0]])>

# Using custom functions with TextVectorization

In [7]:
import re
import string

In [8]:
def standardization_fn(string_tensor):
  """Lowercase the text and strip ASCII punctuation.

  Args:
    string_tensor: String tensor holding the raw text.

  Returns:
    A string tensor with the text lowercased and every character from
    `string.punctuation` removed.
  """
  # Treat the input as UTF-8 so that non-ASCII capitals (e.g. Turkish "Ç",
  # "Ö", "Ş") are lowercased too; the default encoding only handles ASCII.
  lowercase = tf.strings.lower(string_tensor, encoding="utf-8")
  # re.escape() backslashes the regex metacharacters in string.punctuation so
  # that each one is matched literally inside the character class.
  return tf.strings.regex_replace(
      lowercase, f"[{re.escape(string.punctuation)}]", ""
  )

In [9]:
def split_fn(string_tensor):
  """Tokenize each string into words by splitting on whitespace."""
  words = tf.strings.split(string_tensor)
  return words

In [10]:
# Build a new layer that uses the custom standardization and split functions
# in place of the built-in defaults.
text_vectorization = TextVectorization(
    standardize=standardization_fn,
    split=split_fn,
)

In [11]:
# The new layer was created without a vocabulary, so adapt it on the same data.
text_vectorization.adapt(data)

In [12]:
# Test the layer on a single sentence.
text = "bugün ece çok güzel" # eng: today ece is very nice
text_vectorization(text) # Returns the vocabulary index of each word.

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([12, 11,  2,  9])>

# Using TextVectorization in a model

In [13]:
# Create a Dataset object from three single-word strings.
text_dataset = tf.data.Dataset.from_tensor_slices([
    "kedi", "aslan", "yunus" # eng: cat, lion, dolphin
])

In [14]:
# Create the TextVectorization layer. The output_sequence_length parameter pads or truncates every output to the same length.
vectorize_layer = tf.keras.layers.TextVectorization( # With its default settings, the layer lowercases the text, strips punctuation, and splits the text into words.
    max_tokens=5000, # The maximum size of the vocabulary, including the reserved padding and [UNK] entries
    output_sequence_length=4 # The length of each output row. Shorter inputs are padded with 0 up to this length.
)

In [15]:
# Build the vocabulary from the dataset, fed in batches of up to 64 strings.
vectorize_layer.adapt(text_dataset.batch(64))

In [16]:
# Inspect the learned vocabulary.
vectorize_layer.get_vocabulary()

['', '[UNK]', 'yunus', 'kedi', 'aslan']

In [17]:
# Build a model whose only layer is the vectorization layer.
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string), # Each sample is a single string, hence shape=(1,) and dtype=tf.string.
    vectorize_layer
])

In [18]:
# Sample data for testing; "kartal" (eagle) and "fok" (seal) are not in the vocabulary.
input_data=[["kedi kartal aslan"], ["fok yunus"]]

In [19]:
# Run the model to turn each input string into a row of vocabulary indices.
model.predict(input_data)



array([[3, 1, 4, 0],
       [1, 2, 0, 0]])

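array([[3 (kedi), 1 (kartal), 4 (aslan), 0 (padding)],
       [1 (fok), 2 (yunus), 0 (padding), 0 (padding)]])
Padding (0) fills each row up to the output sequence length of 4, so every row has the same size. 1 means out-of-vocabulary: "kartal" and "fok" were not in the adapted vocabulary.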