In [1]:
import string
class Vectorizer:
  """Toy word-level text vectorizer.

  Text is lowercased, stripped of ASCII punctuation, split on whitespace,
  and each token is mapped to an integer index. Index 0 is reserved for the
  empty string and index 1 for the out-of-vocabulary marker "[UNK]".

  Attributes:
    vocabulary: dict mapping token (str) -> index (int).
    inverse_vocabulary: dict mapping index (int) -> token (str).
  """

  UNK_TOKEN = "[UNK]"
  UNK_INDEX = 1
  # Translation table that deletes every character in string.punctuation.
  _STRIP_PUNCTUATION = str.maketrans("", "", string.punctuation)

  def __init__(self):
    # Start with the reserved entries so encode/decode are usable (every
    # token maps to [UNK]) even before make_vocabulary has been called.
    self._reset_vocabulary()

  def _reset_vocabulary(self):
    """(Re)initialize both mappings to the two reserved entries."""
    self.vocabulary = {"": 0, self.UNK_TOKEN: self.UNK_INDEX}
    self.inverse_vocabulary = {0: "", self.UNK_INDEX: self.UNK_TOKEN}

  def standardize(self, text):
    """Return `text` lowercased with all ASCII punctuation removed."""
    return text.lower().translate(self._STRIP_PUNCTUATION)

  def tokenize(self, text):
    """Standardize `text` and split it on whitespace into a list of tokens."""
    return self.standardize(text).split()

  def make_vocabulary(self, dataset):
    """Rebuild the vocabulary from an iterable of strings.

    Tokens receive consecutive indices (starting at 2) in order of first
    appearance. Any previously built vocabulary is discarded.
    """
    self._reset_vocabulary()
    for text in dataset:
      # tokenize() already standardizes, so no separate standardize() call.
      for token in self.tokenize(text):
        if token not in self.vocabulary:
          self.vocabulary[token] = len(self.vocabulary)
    self.inverse_vocabulary = {
      index: token for token, index in self.vocabulary.items()}

  def encode(self, text):
    """Return the list of vocabulary indices for `text`.

    Tokens missing from the vocabulary are encoded as UNK_INDEX.
    """
    return [self.vocabulary.get(token, self.UNK_INDEX)
            for token in self.tokenize(text)]

  def decode(self, int_sequence):
    """Return the space-joined tokens for an iterable of indices.

    Indices missing from the inverse vocabulary are rendered as "[UNK]".
    """
    return " ".join(
      self.inverse_vocabulary.get(i, self.UNK_TOKEN) for i in int_sequence)
# Build the vocabulary of the hand-written Vectorizer from a three-line toy corpus.
vectorizer = Vectorizer()
dataset = [
 "I write, erase, rewrite",
 "Erase again, and then",
 "A poppy blooms.",
]
vectorizer.make_vocabulary(dataset)

In [2]:
# Encode a sentence with the toy vectorizer; "still" is not in the corpus,
# so it falls back to the default index 1 used by Vectorizer.encode.
test_sentence = "I write, rewrite, and still rewrite again"
encoded_sentence = vectorizer.encode(test_sentence)
print(encoded_sentence)

[2, 3, 5, 7, 1, 5, 6]


In [3]:
from tensorflow.keras.layers import TextVectorization
# Keras' built-in vectorization layer, configured to output integer token indices.
text_vectorization = TextVectorization(
 output_mode="int",
)

In [4]:
import re
import string
import tensorflow as tf
def custom_standardization_fn(string_tensor):
  """Lowercase a string tensor and delete every ASCII punctuation character.

  The punctuation set is `string.punctuation`, escaped and wrapped in a regex
  character class so each matching character is replaced by the empty string.
  """
  punctuation_class = f"[{re.escape(string.punctuation)}]"
  lowered = tf.strings.lower(string_tensor)
  return tf.strings.regex_replace(lowered, punctuation_class, "")
def custom_split_fn(string_tensor):
  """Tokenize a string tensor by delegating to `tf.strings.split` with its default separator."""
  tokens = tf.strings.split(string_tensor)
  return tokens
# Same integer-output layer, but with the standardization and splitting steps
# supplied explicitly by the two functions defined above.
text_vectorization = TextVectorization(
  output_mode="int",
  standardize=custom_standardization_fn,
  split=custom_split_fn,
)

In [5]:
# Index the vocabulary of the toy corpus with adapt(), then display it
# (the last expression of the cell is the cell's output).
dataset = [
  "I write, erase, rewrite",
  "Erase again, and then",
  "A poppy blooms.",
]
text_vectorization.adapt(dataset)
text_vectorization.get_vocabulary()


['',
 '[UNK]',
 'erase',
 'write',
 'then',
 'rewrite',
 'poppy',
 'i',
 'blooms',
 'and',
 'again',
 'a']

In [6]:
# Encode the test sentence with the adapted layer; calling the layer on a
# string returns a tensor of vocabulary indices.
vocabulary = text_vectorization.get_vocabulary()
test_sentence = "I write, rewrite, and still rewrite again"
encoded_sentence = text_vectorization(test_sentence)
print(encoded_sentence)

tf.Tensor([ 7  3  5  9  1  5 10], shape=(7,), dtype=int64)


In [7]:
# Invert the vocabulary list (position -> token) and map the encoded indices
# back to their tokens.
inverse_vocab = dict(enumerate(vocabulary))
# int(i) converts each scalar tensor element to a plain Python dict key.
decoded_sentence = " ".join(inverse_vocab[int(i)] for i in encoded_sentence)
print(decoded_sentence)

i write rewrite and [UNK] rewrite again


In [8]:
from tensorflow import keras
# Load the train / validation / test splits from their directories as batched
# datasets of (text, label) pairs.
# NOTE(review): paths are relative to the notebook's working directory, and
# "aclImdb/val" is presumably created in a step not shown here — confirm.
batch_size = 32
train_ds = keras.utils.text_dataset_from_directory(
  "aclImdb/train", batch_size=batch_size
)
val_ds = keras.utils.text_dataset_from_directory(
  "aclImdb/val", batch_size=batch_size
)
test_ds = keras.utils.text_dataset_from_directory(
  "aclImdb/test", batch_size=batch_size
)

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [9]:
# Peek at the first raw batch only: print shapes, dtypes and the first sample.
for inputs, targets in train_ds:
  print("inputs.shape:", inputs.shape)
  print("inputs.dtype:", inputs.dtype)
  print("targets.shape:", targets.shape)
  print("targets.dtype:", targets.dtype)
  print("inputs[0]:", inputs[0])
  print("targets[0]:", targets[0])
  # Stop after the first batch.
  break

inputs.shape: (32,)
inputs.dtype: <dtype: 'string'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor(b'When I saw the preview, I thought: this is going to be a great movie. And indeed it could have been. The actress playing the main character was very credible, and the beauty of the filming is undeniable. However the dialogues cast a dark shadow on the whole picture. The level of language was too familiar and too contemporary for an action taking place in 1610, and it took away most of the magic of the film. However, I must congratulate the translator, because the English sub-titles were more refined and appropriate that the original French cues, and it probably explains the good rating the movie received on the imbd!', shape=(), dtype=string)
targets[0]: tf.Tensor(0, shape=(), dtype=int32)


In [10]:
# Unigram bag-of-words: cap the vocabulary at 20000 entries and encode each
# review as a multi-hot vector.
text_vectorization = TextVectorization(
  max_tokens=20000,
  output_mode="multi_hot",
)
# Drop the labels so adapt() only sees the raw training text.
text_only_train_ds = train_ds.map(lambda x, y: x)
text_vectorization.adapt(text_only_train_ds)
# Vectorize the text of every split while passing the labels through unchanged.
binary_1gram_train_ds = train_ds.map(
  lambda x, y: (text_vectorization(x), y),
  num_parallel_calls=4)
binary_1gram_val_ds = val_ds.map(
  lambda x, y: (text_vectorization(x), y),
  num_parallel_calls=4)
binary_1gram_test_ds = test_ds.map(
  lambda x, y: (text_vectorization(x), y),
  num_parallel_calls=4)

In [11]:
# Peek at the first vectorized batch only: print shapes, dtypes and the first sample.
for inputs, targets in binary_1gram_train_ds:
  print("inputs.shape:", inputs.shape)
  print("inputs.dtype:", inputs.dtype)
  print("targets.shape:", targets.shape)
  print("targets.dtype:", targets.dtype)
  print("inputs[0]:", inputs[0])
  print("targets[0]:", targets[0])
  # Stop after the first batch.
  break

inputs.shape: (32, 20000)
inputs.dtype: <dtype: 'float32'>
targets.shape: (32,)
targets.dtype: <dtype: 'int32'>
inputs[0]: tf.Tensor([0. 1. 1. ... 0. 0. 0.], shape=(20000,), dtype=float32)
targets[0]: tf.Tensor(0, shape=(), dtype=int32)


In [13]:
from tensorflow import keras
from tensorflow.keras import layers
def get_model(max_tokens=20000, hidden_dim=16):
  """Build and compile a small dense binary classifier.

  Args:
    max_tokens: width of each input vector (one feature per vocabulary entry).
    hidden_dim: number of units in the single hidden ReLU layer.

  Returns:
    A compiled `keras.Model` with one sigmoid output unit, trained with
    RMSprop on binary cross-entropy and reporting accuracy.
  """
  bow_input = keras.Input(shape=(max_tokens,))
  hidden = layers.Dense(hidden_dim, activation="relu")(bow_input)
  regularized = layers.Dropout(0.5)(hidden)
  prediction = layers.Dense(1, activation="sigmoid")(regularized)

  classifier = keras.Model(bow_input, prediction)
  classifier.compile(
    optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])
  return classifier

In [14]:
# Train the dense classifier on the unigram multi-hot datasets.
model = get_model()
model.summary()
# save_best_only=True asks the callback to keep only the best checkpoint
# (per the callback's monitored metric) at "binary_1gram.tf".
callbacks = [
  keras.callbacks.ModelCheckpoint("binary_1gram.tf",
                                  save_best_only=True,
                                  save_format="tf")
]
# cache() is applied to the vectorized training and validation datasets
# before they are passed to fit().
model.fit(binary_1gram_train_ds.cache(),
  validation_data=binary_1gram_val_ds.cache(),
  epochs=10,
  callbacks=callbacks)
# Reload the saved checkpoint and report test accuracy; evaluate() returns
# [loss, accuracy] for this model, hence the [1].
model = keras.models.load_model("binary_1gram.tf")
print(f"Test acc: {model.evaluate(binary_1gram_test_ds)[1]:.3f}")

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense (Dense)               (None, 16)                320016    
                                                                 
 dropout (Dropout)           (None, 16)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10


INFO:tensorflow:Assets written to: binary_1gram.tf/assets


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test acc: 0.885


In [15]:
# Same multi-hot encoding as before, now with ngrams=2 on the layer.
text_vectorization = TextVectorization(
  ngrams=2,
  max_tokens=20000,
  output_mode="multi_hot",
)

In [16]:
# Fit the bigram vocabulary on the training text, then vectorize every split
# while passing the labels through unchanged.
text_vectorization.adapt(text_only_train_ds)
binary_2gram_train_ds = train_ds.map(
  lambda x, y: (text_vectorization(x), y),
  num_parallel_calls=4)
binary_2gram_val_ds = val_ds.map(
  lambda x, y: (text_vectorization(x), y),
  num_parallel_calls=4)
binary_2gram_test_ds = test_ds.map(
  lambda x, y: (text_vectorization(x), y),
  num_parallel_calls=4)
# Fresh dense classifier, trained on the bigram multi-hot datasets.
model = get_model()
model.summary()
callbacks = [
  keras.callbacks.ModelCheckpoint("binary_2gram.tf",save_best_only=True)
]
model.fit(binary_2gram_train_ds.cache(),
  validation_data=binary_2gram_val_ds.cache(),
  epochs=10,
  callbacks=callbacks)
# Reload the saved checkpoint and report test accuracy ([1] is the accuracy metric).
model = keras.models.load_model("binary_2gram.tf")
print(f"Test acc: {model.evaluate(binary_2gram_test_ds)[1]:.3f}")

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_2 (Dense)             (None, 16)                320016    
                                                                 
 dropout_1 (Dropout)         (None, 16)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10


INFO:tensorflow:Assets written to: binary_2gram.tf/assets


Epoch 2/10


INFO:tensorflow:Assets written to: binary_2gram.tf/assets


Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test acc: 0.894


In [17]:
# Bigram layer with output_mode="count" instead of "multi_hot".
text_vectorization = TextVectorization(
 ngrams=2,
 max_tokens=20000,
 output_mode="count"
)

In [18]:
def tfidf(term, document, dataset):
  """Illustrative TF-IDF-style score of `term` in `document`.

  The score is the number of occurrences of `term` in `document` divided by
  the natural log of (1 + total occurrences of `term` across `dataset`).

  Args:
    term: the item to score.
    document: a sequence supporting `.count(term)` (e.g. a str or a list of tokens).
    dataset: an iterable of documents of the same kind as `document`.

  Returns:
    The score as a float; 0.0 when `term` does not occur in `document`.

  Raises:
    ZeroDivisionError: if `term` occurs in `document` but nowhere in
      `dataset` (i.e. `document` is not part of `dataset`).
  """
  # Imported locally because no cell of the notebook imports `math`.
  import math

  term_freq = document.count(term)
  if term_freq == 0:
    # Avoids 0 / log(1) == 0 / 0.0 when the term is absent from the dataset too.
    return 0.0
  doc_freq = math.log(sum(doc.count(term) for doc in dataset) + 1)
  return term_freq / doc_freq

In [19]:
# Bigram layer with output_mode="tf_idf".
text_vectorization = TextVectorization(
  ngrams=2,
  max_tokens=20000,
  output_mode="tf_idf",
)

In [22]:
# Fit the tf-idf bigram layer on the training text, then vectorize every
# split while passing the labels through unchanged.
text_vectorization.adapt(text_only_train_ds)
tfidf_2gram_train_ds = train_ds.map(
  lambda x, y: (text_vectorization(x), y),
  num_parallel_calls=4)
tfidf_2gram_val_ds = val_ds.map(
  lambda x, y: (text_vectorization(x), y),
  num_parallel_calls=4)
tfidf_2gram_test_ds = test_ds.map(
  lambda x, y: (text_vectorization(x), y),
  num_parallel_calls=4)
# Fresh dense classifier, trained on the tf-idf bigram datasets.
model = get_model()
model.summary()
callbacks = [
  keras.callbacks.ModelCheckpoint("tfidf_2gram.tf",
   save_best_only=True)
]
model.fit(tfidf_2gram_train_ds.cache(),
  validation_data=tfidf_2gram_val_ds.cache(),
  epochs=10,
  callbacks=callbacks)
# Reload the saved checkpoint and report test accuracy ([1] is the accuracy metric).
model = keras.models.load_model("tfidf_2gram.tf")
print(f"Test acc: {model.evaluate(tfidf_2gram_test_ds)[1]:.3f}")

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 20000)]           0         
                                                                 
 dense_8 (Dense)             (None, 16)                320016    
                                                                 
 dropout_4 (Dropout)         (None, 16)                0         
                                                                 
 dense_9 (Dense)             (None, 1)                 17        
                                                                 
Total params: 320033 (1.22 MB)
Trainable params: 320033 (1.22 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/10


INFO:tensorflow:Assets written to: tfidf_2gram.tf/assets


Epoch 2/10


INFO:tensorflow:Assets written to: tfidf_2gram.tf/assets


Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test acc: 0.897


In [33]:
from tensorflow.keras import layers
# Sequence models: encode each review as a sequence of integer token indices.
# max_length is passed as the layer's output_sequence_length and max_tokens
# caps the vocabulary size.
max_length = 600
max_tokens = 20000
text_vectorization = layers.TextVectorization(
  max_tokens=max_tokens,
  output_mode="int",
  output_sequence_length=max_length
)
text_vectorization.adapt(text_only_train_ds)
# Vectorize the text of every split while passing the labels through unchanged.
int_train_ds = train_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
int_val_ds = val_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)
int_test_ds = test_ds.map(
    lambda x, y: (text_vectorization(x), y),
    num_parallel_calls=4)

In [35]:
import tensorflow as tf
# Bidirectional LSTM over one-hot encoded tokens.
# shape=(None,) leaves the sequence length unspecified.
inputs = keras.Input(shape=(None,), dtype="int64")
# Expand each integer index into a max_tokens-wide one-hot vector.
embedded = tf.one_hot(inputs, depth=max_tokens)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
  loss="binary_crossentropy",
  metrics=["accuracy"])
model.summary()  

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_8 (InputLayer)        [(None, None)]            0         
                                                                 
 tf.one_hot (TFOpLambda)     (None, None, 20000)       0         
                                                                 
 bidirectional_1 (Bidirecti  (None, 64)                5128448   
 onal)                                                           
                                                                 
 dropout_6 (Dropout)         (None, 64)                0         
                                                                 
 dense_11 (Dense)            (None, 1)                 65        
                                                                 
Total params: 5128513 (19.56 MB)
Trainable params: 5128513 (19.56 MB)
Non-trainable params: 0 (0.00 Byte)
___________________

In [36]:
# Train the one-hot bidirectional LSTM, checkpointing the best model.
# NOTE(review): the recorded run of this cell was interrupted during epoch 2
# (KeyboardInterrupt), so the reload/evaluate lines have no recorded result.
callbacks = [
  keras.callbacks.ModelCheckpoint("one_hot_bidir_lstm.tf",
    save_best_only=True)
]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10,
  callbacks=callbacks)
# Reload the saved checkpoint and report test accuracy ([1] is the accuracy metric).
model = keras.models.load_model("one_hot_bidir_lstm.tf")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Epoch 1/10


INFO:tensorflow:Assets written to: one_hot_bidir_lstm.tf/assets


Epoch 2/10
 13/625 [..............................] - ETA: 13:32 - loss: 0.3787 - accuracy: 0.8510

KeyboardInterrupt: 

In [37]:
# Same architecture as the one-hot model, but tokens go through a learned
# 256-dimensional Embedding layer instead of tf.one_hot.
inputs = keras.Input(shape=(None,), dtype="int64")
embedded = layers.Embedding(input_dim=max_tokens, output_dim=256)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
  loss="binary_crossentropy",
  metrics=["accuracy"])
model.summary()
# NOTE(review): the checkpoint is named "embeddings_bidir_gru.tf" although the
# recurrent layer above is an LSTM; the name is only a file path, but consider
# renaming it (in both places below) to avoid confusion.
callbacks = [
  keras.callbacks.ModelCheckpoint("embeddings_bidir_gru.tf",save_best_only=True)
]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=10,
  callbacks=callbacks)
# Reload the saved checkpoint and report test accuracy ([1] is the accuracy metric).
model = keras.models.load_model("embeddings_bidir_gru.tf")
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_9 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 256)         5120000   
                                                                 
 bidirectional_2 (Bidirecti  (None, 64)                73984     
 onal)                                                           
                                                                 
 dropout_7 (Dropout)         (None, 64)                0         
                                                                 
 dense_12 (Dense)            (None, 1)                 65        
                                                                 
Total params: 5194049 (19.81 MB)
Trainable params: 5194049 (19.81 MB)
Non-trainable params: 0 (0.00 Byte)
___________________

INFO:tensorflow:Assets written to: embeddings_bidir_gru.tf/assets


Epoch 2/10


INFO:tensorflow:Assets written to: embeddings_bidir_gru.tf/assets


Epoch 3/10


INFO:tensorflow:Assets written to: embeddings_bidir_gru.tf/assets


Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test acc: 0.870


In [39]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
class TransformerEncoder(layers.Layer):
  """Single Transformer encoder block implemented as a Keras layer.

  Self-attention followed by a two-layer dense projection, each wrapped in a
  residual connection and layer normalization.

  Args:
    embed_dim: size of the last axis of the input; also used as the attention
      key dimension and as the output width of the dense projection.
    dense_dim: number of units in the inner (ReLU) dense layer.
    num_heads: number of attention heads.
    **kwargs: forwarded to `layers.Layer`.
  """

  def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
    super().__init__(**kwargs)
    # Hyperparameters are kept on the instance so get_config() can return them.
    self.embed_dim = embed_dim
    self.dense_dim = dense_dim
    self.num_heads = num_heads
    self.attention = layers.MultiHeadAttention(
      num_heads=num_heads, key_dim=embed_dim)
    self.dense_proj = keras.Sequential([
      layers.Dense(dense_dim, activation="relu"),
      layers.Dense(embed_dim),
    ])
    self.layernorm_1 = layers.LayerNormalization()
    self.layernorm_2 = layers.LayerNormalization()

  def call(self, inputs, mask=None):
    """Apply self-attention and the dense projection to `inputs`.

    When `mask` is given, a new axis is inserted at position 1 before it is
    passed to the attention layer as `attention_mask`.
    """
    if mask is None:
      attention_mask = None
    else:
      attention_mask = mask[:, tf.newaxis, :]
    attended = self.attention(inputs, inputs, attention_mask=attention_mask)
    # First residual connection + normalization.
    normalized = self.layernorm_1(inputs + attended)
    # Second residual connection + normalization around the dense projection.
    projected = self.dense_proj(normalized)
    return self.layernorm_2(normalized + projected)

  def get_config(self):
    """Return the base layer config extended with the constructor arguments."""
    return {
      **super().get_config(),
      "embed_dim": self.embed_dim,
      "num_heads": self.num_heads,
      "dense_dim": self.dense_dim,
    }

In [40]:
# Text classifier built around one TransformerEncoder block.
vocab_size = 20000
embed_dim = 256
num_heads = 2
dense_dim = 32
inputs = keras.Input(shape=(None,), dtype="int64")
x = layers.Embedding(vocab_size, embed_dim)(inputs)
x = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)
# Reduce the per-token outputs to a single vector per sample before classifying.
x = layers.GlobalMaxPooling1D()(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.compile(optimizer="rmsprop",
  loss="binary_crossentropy",
  metrics=["accuracy"])
model.summary()

Model: "model_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_11 (InputLayer)       [(None, None)]            0         
                                                                 
 embedding_3 (Embedding)     (None, None, 256)         5120000   
                                                                 
 transformer_encoder (Trans  (None, None, 256)         543776    
 formerEncoder)                                                  
                                                                 
 global_max_pooling1d (Glob  (None, 256)               0         
 alMaxPooling1D)                                                 
                                                                 
 dropout_8 (Dropout)         (None, 256)               0         
                                                                 
 dense_15 (Dense)            (None, 1)                 257 

In [42]:
# Train the Transformer classifier, checkpointing the best model.
# NOTE(review): the recorded output of this cell is a ValueError about the
# `options` argument not being supported with the native Keras format —
# confirm that the checkpoint path/extension here matches what was actually
# run and that it works with the installed Keras version.
callbacks = [
  keras.callbacks.ModelCheckpoint("transformer_encoder.tf",
    save_best_only=True)
]
model.fit(int_train_ds, validation_data=int_val_ds, epochs=20,
    callbacks=callbacks)
# The custom layer class is passed via custom_objects when reloading the
# saved model.
model = keras.models.load_model(
  "transformer_encoder.tf",
  custom_objects={"TransformerEncoder": TransformerEncoder})
print(f"Test acc: {model.evaluate(int_test_ds)[1]:.3f}")

Epoch 1/20

ValueError: The following argument(s) are not supported with the native Keras format: ['options']