In [None]:
import tensorflow as tf
from tensorflow import keras
import os
import warnings
from tensorflow.python.util import deprecation

In [26]:
tf.get_logger().setLevel('ERROR')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=UserWarning, module='tensorflow')
warnings.filterwarnings('ignore', category=UserWarning, module='tensorflow.core.framework.local_rendezvous')
deprecation._PRINT_DEPRECATION_WARNINGS = False

In [3]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

### Loading and Preprocessing Data for Custom Models

The following functions read and preprocess the CSV data on file. The `parse_csv_line()` function works on each line and separates the review and the label while ensuring consistency. The `preprocess()` function also works on each element of the dataset iteratively and vectorizes the review text using an adapted vectorizer. In the `prepare_dataset()` function, we combine both of these functionalities, plus adapt our vectorizer on a subset of the data, and finally return a shuffled, batched, and prefetched dataset along with the adapted vectorizer. Lastly, we save both of these for later use. While the functions look clean and simple, a lot of experimentation went under the hoods which I have cleaned up.

In [162]:
def parse_csv_line(line):
    parts = tf.strings.split(line, sep=',')
    label = tf.strings.to_number(parts[-1], out_type=tf.int32)
    review = tf.strings.reduce_join(parts[:-1], separator=',')
    return review, label

def preprocess(text, label, vectorizer):
    # text = tf.expand_dims(text, axis=-1)  # Add an extra dimension for the vectorizer
    text = vectorizer(text)
    return text, label

In [163]:
def prepare_dataset_from_csv(filename, header=True, sample_size=5000, max_tokens=20000, output_sequence_length=500, batch_size=32):
    
    dataset = tf.data.TextLineDataset([filename], num_parallel_reads=tf.data.AUTOTUNE)
        
    # Skip header if present
    if header:
        dataset = dataset.skip(1)
    
    dataset = dataset.map(parse_csv_line, num_parallel_calls=tf.data.AUTOTUNE)
    
    # Create and adapt the TextVectorization layer
    vectorizer = keras.layers.TextVectorization(max_tokens=max_tokens, output_sequence_length=output_sequence_length)
    vectorizer.adapt(dataset.take(sample_size).map(lambda x, y: x))

    # Preprocess text
    dataset = dataset.map(lambda text, label: preprocess(text, label, vectorizer), num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.shuffle(buffer_size=10000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    
    return dataset, vectorizer

In [164]:
file_path = 'data/movie.csv'
dataset, vectorizer = prepare_dataset_from_csv(file_path)

2024-07-24 21:30:44.372711: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [165]:
for i in dataset.take(2).as_numpy_iterator():
    print(i[0].shape, i[1].shape)


(32, 500) (32,)
(32, 500) (32,)


2024-07-24 21:30:44.808068: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


#### Saving the Dataset
I am saving the preprocessed dataset as a TFRecords file.

In [167]:
def _create_feature(value, _type):
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy()
        
    if _type=='label':
        value = [value]   # Because Int64List accepts a list, but label is a scalar
    elif _type=='review':
        value = value.tolist()
        
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

def _serialize_example(review, label):
    
    feature = {
        'review': _create_feature(review, _type='review'),
        'label': _create_feature(label, _type='label')
    }
    
    example_review = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_review.SerializeToString()

def save_to_tfr(dataset, out_file='data/dataset_prepared.tfrecord'):
    with tf.io.TFRecordWriter(out_file) as writer:
        for batch in dataset:
            reviews, labels = batch
            for review, label in zip(reviews, labels):
                serialized_example = _serialize_example(review.numpy(), label.numpy())
                writer.write(serialized_example)

In [168]:
save_to_tfr(dataset)

2024-07-24 21:31:14.474807: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


The dataset has been saved and can be loaded as follows. Since after storing as TFR, all the in-memory transformations like shuffling and batching are lost, we need to re-apply those.

In [2]:
feature_description = {
    'review': tf.io.FixedLenFeature([500], tf.int64),
    'label': tf.io.FixedLenFeature([], tf.int64)
}

In [3]:
def prepare_dataset_from_tfr(filename, feature_description=feature_description, batch_size=16, shuffle_buffer_size=10000):
    raw_dataset = tf.data.TFRecordDataset([filename])
    
    def _parse_examples(example_review):      # To parse examples
        return tf.io.parse_single_example(example_review, feature_description)
    
    dataset = raw_dataset.map(_parse_examples)
    dataset = dataset.map(lambda example_dict: (example_dict['review'], example_dict['label']))
    dataset = dataset.shuffle(shuffle_buffer_size).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

In [6]:
dataset = prepare_dataset_from_tfr('data/dataset_prepared.tfrecord')
for i in dataset.take(1):
    print(i)

(<tf.Tensor: shape=(16, 500), dtype=int64, numpy=
array([[4723,   77,   74, ...,    0,    0,    0],
       [  10,   20,   17, ...,    0,    0,    0],
       [  22,   96,   72, ...,    0,    0,    0],
       ...,
       [ 197,  105,   10, ...,    0,    0,    0],
       [   2,  386,    5, ...,    0,    0,    0],
       [ 109,  158,   25, ...,    0,    0,    0]])>, <tf.Tensor: shape=(16,), dtype=int64, numpy=array([1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0])>)


2024-07-25 14:28:59.011219: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


All the preprocessing steps have been done and the dataset is now ready to be trained on. We shall use 4 different models for this purpose:
1. A simple Keras custom DNN with Batch Normalization, Dropout, and other advanced techniques as needed,
2. A simple RNN,
3. An LSTM or GRU, and
4. Transfer Learning using GPT from HuggingFace Transformers using pretrained tokenizers and embeddings.

**1. Custom Deep Neural Network**

In [7]:
BATCHES = 40000/32
TRAIN_SIZE = int(BATCHES*0.8)
train_dataset = dataset.take(TRAIN_SIZE)
test_dataset = dataset.skip(TRAIN_SIZE)

In [24]:
vocab_size = 20000  
embedding_dim = 128  
input_length = 500  

model_1_dnn = keras.models.Sequential([
    keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length),
    keras.layers.Flatten(),
    
    keras.layers.Dense(128, activation='relu'),
    keras.layers.BatchNormalization(),  
    keras.layers.Dropout(0.1),          
    
    keras.layers.Dense(64, activation='relu'),
    keras.layers.BatchNormalization(),  
    keras.layers.Dropout(0.1),          
    
    keras.layers.Dense(1, activation='sigmoid')  
])

model_1_dnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
model_1_dnn.fit(train_dataset, epochs=10, callbacks=[keras.callbacks.EarlyStopping(patience=3)])

In [None]:
loss, accuracy = model_1_dnn.evaluate(test_dataset)

In [34]:
print(f'Loss: {loss}\nAccuracy: {accuracy}')

Loss: 0.32168418169021606
Accuracy: 0.906416654586792


In [37]:
model_1_dnn.save('saved_models/model_1_dnn.keras')

**2. A Recurrent Neural Network**<br>
I have used a **GRU** for this task. Although for a simple task like this one, we could have 

In [None]:
vocab_size = 20000
embedding_dim = 128
input_length = 500

model_2_rnn = keras.models.Sequential(
    [
    keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length),
    
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.2),
    
    keras.layers.GRU(64),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.2),
    
    keras.layers.Dense(1, activation='sigmoid') 
    ]
)

model_2_rnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [9]:
model_2_rnn.fit(train_dataset, epochs=10, callbacks=[keras.callbacks.EarlyStopping(patience=3)])

Epoch 1/10


2024-07-25 14:29:49.161299: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907


[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 40ms/step - accuracy: 0.5472 - loss: 0.7294
Epoch 2/10


2024-07-25 14:30:30.139806: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2024-07-25 14:30:30.139862: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_2]]
  self.gen.throw(value)
  current = self.get_monitor_value(logs)


[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 39ms/step - accuracy: 0.8693 - loss: 0.3202
Epoch 3/10
[1m   1/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:48[0m 168ms/step - accuracy: 0.9375 - loss: 0.1442

2024-07-25 14:31:09.562450: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2024-07-25 14:31:09.562486: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_2]]


[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 40ms/step - accuracy: 0.9295 - loss: 0.1945
Epoch 4/10
[1m   1/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:52[0m 173ms/step - accuracy: 1.0000 - loss: 0.0819

2024-07-25 14:31:49.710020: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2024-07-25 14:31:49.710060: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_2]]


[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 39ms/step - accuracy: 0.9531 - loss: 0.1367
Epoch 5/10
[1m   1/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:12[0m 192ms/step - accuracy: 1.0000 - loss: 0.0553

2024-07-25 14:32:28.945909: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2024-07-25 14:32:28.945944: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_2]]


[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 39ms/step - accuracy: 0.9702 - loss: 0.0953
Epoch 6/10
[1m   1/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:52[0m 173ms/step - accuracy: 1.0000 - loss: 0.0232

2024-07-25 14:33:08.474056: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2024-07-25 14:33:08.474094: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_2]]


[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 39ms/step - accuracy: 0.9811 - loss: 0.0690
Epoch 7/10
[1m   1/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:08[0m 189ms/step - accuracy: 1.0000 - loss: 0.0204

2024-07-25 14:33:47.718411: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2024-07-25 14:33:47.718458: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_2]]


[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 39ms/step - accuracy: 0.9861 - loss: 0.0553
Epoch 8/10
[1m   1/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:15[0m 195ms/step - accuracy: 0.9375 - loss: 0.1447

2024-07-25 14:34:26.744752: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2024-07-25 14:34:26.744792: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_2]]


[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 40ms/step - accuracy: 0.9862 - loss: 0.0444
Epoch 9/10


2024-07-25 14:35:06.840245: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2024-07-25 14:35:06.840294: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_2]]


[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 40ms/step - accuracy: 0.9917 - loss: 0.0352
Epoch 10/10
[1m   1/1000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:03[0m 184ms/step - accuracy: 1.0000 - loss: 0.0092

2024-07-25 14:35:46.879759: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2024-07-25 14:35:46.879798: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_2]]


[1m 304/1000[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m28s[0m 40ms/step - accuracy: 0.9904 - loss: 0.0401