In [None]:
import tensorflow as tf
from tensorflow import keras

In [5]:
# Show the physical devices visible to TensorFlow, to confirm whether a GPU is available.
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

### Loading and Preprocessing Data for Custom Models

The following functions read and preprocess the CSV data on file. The `parse_csv_line()` function works on each line and separates the review from the label, treating the last comma-separated field as the label so that commas inside the review text are preserved. The `preprocess()` function also works on each element of the dataset and vectorizes the review text using an adapted vectorizer. In the `prepare_dataset()` function, we combine both of these functionalities, adapt our vectorizer on a subset of the data, and finally return a shuffled, batched, and prefetched dataset along with the adapted vectorizer. Lastly, we save both of these for later use. While the functions look clean and simple, a lot of experimentation went on under the hood, which I have since cleaned up.

In [None]:
def parse_csv_line(line):
    """Split one raw CSV line into a ``(review, label)`` pair.

    The line is split on every comma. The last field is parsed as an
    ``int32`` label; all preceding fields are glued back together with
    commas, so commas that occur inside the review text survive.

    Args:
        line: A scalar string tensor holding one line of the CSV file.

    Returns:
        A tuple ``(review, label)`` of a string tensor and an ``int32`` tensor.
    """
    fields = tf.strings.split(line, sep=',')
    # Everything before the final comma belongs to the review text.
    review = tf.strings.reduce_join(fields[:-1], separator=',')
    # The final field is the numeric label.
    label = tf.strings.to_number(fields[-1], out_type=tf.int32)
    return review, label

def preprocess(text, label, vectorizer):
    """Vectorize a review and pass its label through unchanged.

    Args:
        text: String tensor with the review text.
        label: Label associated with ``text``; returned as-is.
        vectorizer: Callable applied to the expanded text (in this notebook,
            an adapted ``TextVectorization`` layer).

    Returns:
        A tuple ``(vectorized_text, label)``.
    """
    # The vectorizer is fed the text with one extra trailing axis.
    # NOTE(review): for a scalar `text` this presumably yields a leading axis of
    # size 1 in the result -- confirm downstream code expects that shape.
    expanded = tf.expand_dims(text, axis=-1)
    return vectorizer(expanded), label

In [83]:
def prepare_dataset(filename, header=True, max_tokens=20000, output_sequence_length=160, batch_size=32,
                    adapt_samples=100, shuffle_buffer_size=10000):
    """Build a vectorized, shuffled, batched and prefetched dataset from a CSV file.

    Each line of the file is parsed with ``parse_csv_line`` into a
    ``(review, label)`` pair. A ``TextVectorization`` layer is created and
    adapted on the leading ``adapt_samples`` reviews, then applied to every
    review through ``preprocess``.

    Args:
        filename: Path of the CSV file to read.
        header: If true, the first line of the file is skipped.
        max_tokens: ``max_tokens`` passed to the ``TextVectorization`` layer.
        output_sequence_length: ``output_sequence_length`` passed to the
            ``TextVectorization`` layer.
        batch_size: Number of examples per batch in the returned dataset
            (also used to batch the reviews fed to ``adapt``).
        adapt_samples: Number of leading reviews used to adapt the vectorizer.
            ``None`` adapts on the whole dataset. Defaults to 100, the value
            that was previously hard-coded.
        shuffle_buffer_size: Buffer size handed to ``Dataset.shuffle``.
            Defaults to 10000, the value that was previously hard-coded.

    Returns:
        A tuple ``(dataset, vectorizer)``: the batched dataset of
        ``(vectorized_text, label)`` pairs and the adapted vectorizer.
    """
    lines = tf.data.TextLineDataset([filename], num_parallel_reads=tf.data.AUTOTUNE)

    # Skip header if present
    if header:
        lines = lines.skip(1)

    parsed = lines.map(parse_csv_line, num_parallel_calls=tf.data.AUTOTUNE)

    # Create and adapt the TextVectorization layer
    vectorizer = keras.layers.TextVectorization(max_tokens=max_tokens, output_sequence_length=output_sequence_length)
    adapt_source = parsed if adapt_samples is None else parsed.take(adapt_samples)
    # adapt() is documented to take a *batched* dataset; batching only groups the
    # same reviews together, so the token counts it accumulates are unchanged.
    vectorizer.adapt(adapt_source.map(lambda review, label: review).batch(batch_size))

    # Preprocess text
    dataset = parsed.map(lambda text, label: preprocess(text, label, vectorizer), num_parallel_calls=tf.data.AUTOTUNE)
    # Shuffle individual examples before batching so every batch mixes examples.
    dataset = dataset.shuffle(buffer_size=shuffle_buffer_size).batch(batch_size).prefetch(tf.data.AUTOTUNE)

    return dataset, vectorizer

In [None]:
# Location of the CSV file (a relative path).
file_path = 'data/movie.csv'
# Build the input pipeline with the default settings and keep the adapted vectorizer as well.
dataset, vectorizer = prepare_dataset(file_path)

#### Saving the Dataset and the Vectorizer
I am saving the preprocessed dataset as a TFRecord file, and the vectorizer directly using its `save()` method.

def 