In [20]:
import tensorflow as tf
from tensorflow import keras
import os
import warnings
from tensorflow.python.util import deprecation

In [26]:
tf.get_logger().setLevel('ERROR')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=UserWarning, module='tensorflow')
warnings.filterwarnings('ignore', category=UserWarning, module='tensorflow.core.framework.local_rendezvous')
deprecation._PRINT_DEPRECATION_WARNINGS = False

In [3]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

### Loading and Preprocessing Data for Custom Models

The following functions read and preprocess the CSV data on file. The `parse_csv_line()` function works on each line and separates the review and the label while ensuring consistency. The `preprocess()` function also works on each element of the dataset iteratively and vectorizes the review text using an adapted vectorizer. In the `prepare_dataset()` function, we combine both of these functionalities, plus adapt our vectorizer on a subset of the data, and finally return a shuffled, batched, and prefetched dataset along with the adapted vectorizer. Lastly, we save both of these for later use. While the functions look clean and simple, a lot of experimentation went under the hoods which I have cleaned up.

In [162]:
def parse_csv_line(line):
    parts = tf.strings.split(line, sep=',')
    label = tf.strings.to_number(parts[-1], out_type=tf.int32)
    review = tf.strings.reduce_join(parts[:-1], separator=',')
    return review, label

def preprocess(text, label, vectorizer):
    # text = tf.expand_dims(text, axis=-1)  # Add an extra dimension for the vectorizer
    text = vectorizer(text)
    return text, label

In [163]:
def prepare_dataset_from_csv(filename, header=True, sample_size=5000, max_tokens=20000, output_sequence_length=500, batch_size=32):
    
    dataset = tf.data.TextLineDataset([filename], num_parallel_reads=tf.data.AUTOTUNE)
        
    # Skip header if present
    if header:
        dataset = dataset.skip(1)
    
    dataset = dataset.map(parse_csv_line, num_parallel_calls=tf.data.AUTOTUNE)
    
    # Create and adapt the TextVectorization layer
    vectorizer = keras.layers.TextVectorization(max_tokens=max_tokens, output_sequence_length=output_sequence_length)
    vectorizer.adapt(dataset.take(sample_size).map(lambda x, y: x))

    # Preprocess text
    dataset = dataset.map(lambda text, label: preprocess(text, label, vectorizer), num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.shuffle(buffer_size=10000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    
    return dataset, vectorizer

In [164]:
file_path = 'data/movie.csv'
dataset, vectorizer = prepare_dataset_from_csv(file_path)

2024-07-24 21:30:44.372711: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [165]:
for i in dataset.take(2).as_numpy_iterator():
    print(i[0].shape, i[1].shape)


(32, 500) (32,)
(32, 500) (32,)


2024-07-24 21:30:44.808068: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


#### Saving the Dataset
I am saving the preprocessed dataset as a TFRecords file.

In [167]:
def _create_feature(value, _type):
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy()
        
    if _type=='label':
        value = [value]   # Because Int64List accepts a list, but label is a scalar
    elif _type=='review':
        value = value.tolist()
        
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

def _serialize_example(review, label):
    
    feature = {
        'review': _create_feature(review, _type='review'),
        'label': _create_feature(label, _type='label')
    }
    
    example_review = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_review.SerializeToString()

def save_to_tfr(dataset, out_file='data/dataset_prepared.tfrecord'):
    with tf.io.TFRecordWriter(out_file) as writer:
        for batch in dataset:
            reviews, labels = batch
            for review, label in zip(reviews, labels):
                serialized_example = _serialize_example(review.numpy(), label.numpy())
                writer.write(serialized_example)

In [168]:
save_to_tfr(dataset)

2024-07-24 21:31:14.474807: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


The dataset has been saved and can be loaded as follows. Since after storing as TFR, all the in-memory transformations like shuffling and batching are lost, we need to re-apply those.

In [3]:
feature_description = {
    'review': tf.io.FixedLenFeature([500], tf.int64),
    'label': tf.io.FixedLenFeature([], tf.int64)
}

In [4]:
def prepare_dataset_from_tfr(filename, feature_description=feature_description, batch_size=16, shuffle_buffer_size=10000):
    raw_dataset = tf.data.TFRecordDataset([filename])
    
    def _parse_examples(example_review):      # To parse examples
        return tf.io.parse_single_example(example_review, feature_description)
    
    dataset = raw_dataset.map(_parse_examples)
    dataset = dataset.map(lambda example_dict: (example_dict['review'], example_dict['label']))
    dataset = dataset.shuffle(shuffle_buffer_size).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

In [6]:
dataset = prepare_dataset_from_tfr('data/dataset_prepared.tfrecord')
for i in dataset.take(1):
    print(i)

(<tf.Tensor: shape=(16, 500), dtype=int64, numpy=
array([[8838,   98,   26, ...,    0,    0,    0],
       [  41,  209,    2, ...,    0,    0,    0],
       [  10,    7,   29, ...,    0,    0,    0],
       ...,
       [3065, 1652,   99, ...,    0,    0,    0],
       [8735, 9327, 1022, ...,    0,    0,    0],
       [4933, 2709,    5, ...,    0,    0,    0]])>, <tf.Tensor: shape=(16,), dtype=int64, numpy=array([0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1])>)


2024-07-24 21:47:42.737563: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


All the preprocessing steps have been done and the dataset is now ready to be trained on. We shall use 4 different models for this purpose:
1. A simple Keras custom DNN with Batch Normalization, Dropout, and other advanced techniques as needed,
2. A simple RNN,
3. An LSTM or GRU, and
4. Transfer Learning using GPT from HuggingFace Transformers using pretrained tokenizers and embeddings.

**1. Custom Deep Neural Network**

In [14]:
BATCHES = 40000/32
TRAIN_SIZE = int(BATCHES*0.8)
train_dataset = dataset.take(TRAIN_SIZE)
test_dataset = dataset.skip(TRAIN_SIZE)

In [24]:
vocab_size = 20000  
embedding_dim = 128  
input_length = 500  

model_1_dnn = keras.models.Sequential([
    keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length),
    keras.layers.Flatten(),
    
    keras.layers.Dense(128, activation='relu'),
    keras.layers.BatchNormalization(),  
    keras.layers.Dropout(0.1),          
    
    keras.layers.Dense(64, activation='relu'),
    keras.layers.BatchNormalization(),  
    keras.layers.Dropout(0.1),          
    
    keras.layers.Dense(1, activation='sigmoid')  
])

model_1_dnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
model_1_dnn.fit(train_dataset, epochs=10, callbacks=[keras.callbacks.EarlyStopping(patience=3)])

In [None]:
loss, accuracy = model_1_dnn.evaluate(test_dataset)

In [34]:
print(f'Loss: {loss}\nAccuracy: {accuracy}')

Loss: 0.32168418169021606
Accuracy: 0.906416654586792
