In [39]:
'''
The results may or may not be reproducible due to dataset shuffling operations,
and also because my sole focus was on creating a visibly beautiful notebook, 
and training ONE good model for submission, for the sake of the assignment requirements.
'''

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import os
import warnings
from tensorflow.python.util import deprecation
from transformers import AutoTokenizer, TFBertForSequenceClassification

In [26]:
tf.get_logger().setLevel('ERROR')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=UserWarning, module='tensorflow')
warnings.filterwarnings('ignore', category=UserWarning, module='tensorflow.core.framework.local_rendezvous')
deprecation._PRINT_DEPRECATION_WARNINGS = False

In [3]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

### Loading and Preprocessing Data for Custom Models

The following functions read and preprocess the CSV data on file. The `parse_csv_line()` function works on each line and separates the review and the label while ensuring consistency. The `preprocess()` function also works on each element of the dataset iteratively and vectorizes the review text using an adapted vectorizer. In the `prepare_dataset()` function, we combine both of these functionalities, plus adapt our vectorizer on a subset of the data, and finally return a shuffled, batched, and prefetched dataset along with the adapted vectorizer. Lastly, we save both of these for later use. While the functions look clean and simple, a lot of experimentation went under the hoods which I have cleaned up.

In [36]:
def parse_csv_line(line):
    parts = tf.strings.split(line, sep=',')
    label = tf.strings.to_number(parts[-1], out_type=tf.int32)
    review = tf.strings.reduce_join(parts[:-1], separator=',')
    
    return review, label

# old
def preprocess(text, label, vectorizer):
    # text = tf.expand_dims(text, axis=-1)  # Add an extra dimension for the vectorizer
    text = vectorizer(text)
    return text, label

In [None]:
# Legacy
def prepare_dataset_from_csv(filename, header=True, sample_size=5000, max_tokens=20000, output_sequence_length=500, batch_size=16):
    
    dataset = tf.data.TextLineDataset([filename], num_parallel_reads=tf.data.AUTOTUNE)
        
    # Skip header if present
    if header:
        dataset = dataset.skip(1)
    
    dataset = dataset.map(parse_csv_line, num_parallel_calls=tf.data.AUTOTUNE)
    
    # Create and adapt the TextVectorization layer
    vectorizer = keras.layers.TextVectorization(max_tokens=max_tokens, output_sequence_length=output_sequence_length)
    vectorizer.adapt(dataset.take(sample_size).map(lambda x, y: x))


    # Preprocess text
    dataset = dataset.map(lambda text, label: preprocess(text, label, vectorizer), num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.shuffle(buffer_size=10000).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    
    return dataset, vectorizer

file_path = 'data/movie.csv'
dataset, vectorizer = prepare_dataset_from_csv(file_path, output_sequence_length=512)

In [38]:
for i in dataset.take(2).as_numpy_iterator():
    print(i[0].shape, i[1].shape)

(16, 512) (16,)
(16, 512) (16,)


#### Saving the Dataset
I am saving the preprocessed dataset as a TFRecords file.

In [167]:
def _create_feature(value, _type):
    if isinstance(value, type(tf.constant(0))):
        value = value.numpy()
        
    if _type=='label':
        value = [value]   # Because Int64List accepts a list, but label is a scalar
    elif _type=='review':
        value = value.tolist()
        
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

def _serialize_example(review, label):
    
    feature = {
        'review': _create_feature(review, _type='review'),
        'label': _create_feature(label, _type='label')
    }
    
    example_review = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_review.SerializeToString()

def save_to_tfr(dataset, out_file='data/dataset_prepared.tfrecord'):
    with tf.io.TFRecordWriter(out_file) as writer:
        for batch in dataset:
            reviews, labels = batch
            for review, label in zip(reviews, labels):
                serialized_example = _serialize_example(review.numpy(), label.numpy())
                writer.write(serialized_example)

In [None]:
save_to_tfr(dataset)

The dataset has been saved and can be loaded as follows. Since after storing as TFR, all the in-memory transformations like shuffling and batching are lost, we need to re-apply those.

In [3]:
feature_description = {
    'review': tf.io.FixedLenFeature([500], tf.int64),
    'label': tf.io.FixedLenFeature([], tf.int64)
}

In [4]:
def prepare_dataset_from_tfr(filename, feature_description=feature_description, batch_size=16, shuffle_buffer_size=10000):
    raw_dataset = tf.data.TFRecordDataset([filename])
    
    def _parse_examples(example_review):      # To parse examples
        return tf.io.parse_single_example(example_review, feature_description)
    
    dataset = raw_dataset.map(_parse_examples)
    dataset = dataset.map(lambda example_dict: (example_dict['review'], example_dict['label']))
    dataset = dataset.shuffle(shuffle_buffer_size).batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return dataset

In [6]:
dataset = prepare_dataset_from_tfr('data/dataset_prepared.tfrecord')
for i in dataset.take(1):
    print(i)

(<tf.Tensor: shape=(16, 500), dtype=int64, numpy=
array([[  11,   62,   77, ...,    0,    0,    0],
       [   8,   32,   55, ...,  179,  393,   65],
       [ 102,    3,  552, ...,    0,    0,    0],
       ...,
       [   1,   98,  222, ...,    0,    0,    0],
       [ 212,    9,  648, ..., 2954,  253,   22],
       [  30,    2,  118, ...,    0,    0,    0]])>, <tf.Tensor: shape=(16,), dtype=int64, numpy=array([0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1])>)


2024-07-27 08:30:50.773638: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


### Model Training
All the preprocessing steps have been done and the dataset is now ready to be trained on. We shall use 4 different models for this purpose:
1. A simple Keras custom DNN with Batch Normalization, Dropout, and other advanced techniques as needed,
2. A simple RNN,
3. An LSTM or GRU, and
4. Transfer Learning using a HuggingFace Transformers with pretrained tokenizers and embeddings.

**1. Custom Deep Neural Network**

In [35]:
BATCHES = 40000 // 16  
TRAIN_SIZE = int(BATCHES * 0.8)
VAL_SIZE = int(BATCHES * 0.1)

train_dataset = dataset.take(TRAIN_SIZE)
val_dataset = dataset.skip(TRAIN_SIZE).take(VAL_SIZE)
test_dataset = dataset.skip(TRAIN_SIZE + VAL_SIZE)


In [21]:
for i in train_dataset.take(1):
    print(i[0].shape, i[1].shape)

(16, 500) (16,)


In [23]:
vocab_size = 20000  
embedding_dim = 128  
input_length = 500  

model_1_dnn = keras.models.Sequential([
    keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length),
    keras.layers.Flatten(),
    
    keras.layers.Dense(128, activation='relu'),
    keras.layers.BatchNormalization(),  
    keras.layers.Dropout(0.1),          
    
    keras.layers.Dense(64, activation='relu'),
    keras.layers.BatchNormalization(),  
    keras.layers.Dropout(0.1),          
    
    keras.layers.Dense(1, activation='sigmoid')  
])

model_1_dnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
model_1_dnn.fit(train_dataset, validation_data=val_dataset, epochs=10, callbacks=[keras.callbacks.EarlyStopping(patience=3)])

In [None]:
loss, accuracy = model_1_dnn.evaluate(test_dataset)

In [34]:
print(f'Loss: {loss}\nAccuracy: {accuracy}')

Loss: 0.32168418169021606
Accuracy: 0.906416654586792


In [37]:
model_1_dnn.save('saved_models/model_1_dnn.keras')

**2. A Simple Recurrent Neural Network**

In [None]:
model_2_rnn = keras.models.Sequential([
    keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length),
    
    keras.layers.SimpleRNN(128, return_sequences=True), 
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.1),
    
    keras.layers.SimpleRNN(64),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.1),
    
    keras.layers.Dense(1, activation='sigmoid')
])

model_2_rnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Run with caution if using a CPU, took 10+ min on my 4070Ti for some reason
model_2_rnn.fit(train_dataset, validation_data=val_dataset, epochs=10, callbacks=[keras.callbacks.EarlyStopping(patience=3)])

In [None]:
loss_rnn, accuracy_rnn = model_2_rnn.evaluate(test_dataset)

In [38]:
print(f'RNN Loss: {loss_rnn}\nRNN Accuracy: {accuracy_rnn}')

RNN Loss: 0.10939233333333
RNN Accuracy: 0.94426852180481


**3. An RNN with Gated Recurrent Units**

In [None]:
model_3_gru = keras.models.Sequential(
    [
    keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length),
    
    keras.layers.GRU(128, return_sequences=True),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.2),
    
    keras.layers.GRU(64),
    keras.layers.BatchNormalization(),
    keras.layers.Dropout(0.2),
    
    keras.layers.Dense(1, activation='sigmoid') 
    ]
)

model_3_gru.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [41]:
model_3_gru = keras.models.load_model('saved_models/model_3_gru.keras')

In [None]:
model_3_gru.fit(train_dataset, epochs=4, validation_data=val_dataset, callbacks=[keras.callbacks.EarlyStopping(patience=2)])

In [None]:
loss_gru, accuracy_gru = model_3_gru.evaluate(test_dataset)

In [44]:
print(f'GRU Loss: {loss_gru}\nGRU Accuracy: {accuracy_gru}')

GRU Loss: 0.02748866192996502
GRU Accuracy: 0.9950000047683716


**4. Transfer Learning using GPT**

Now, we need new preprocessing steps because GPT has its own Embedding layers and Tokenizers.

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model_4_bert = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

I initially tried using `tf.data.TextLineDataset()` to read the file, but the tokenizer accepts strings as input and `.map()` on the `TextLineDataset` uses `SymbolicTensors` in some intermediate steps, leading to errors. For the time being, I am using Pandas to read the file and tokenize it outside TensorFlow. Although this could be terribly slow, the solution works just fine. This will be replaced with performance optimized TensorFlow-only code as soon as I find a fix.

In [3]:
def prepare_data(filename, tokenizer):
    df = pd.read_csv(filename)
    
    encodings = tokenizer(
        df['text'].tolist(),
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors='tf'
    )
    
    input_ids = encodings['input_ids']
    attention_masks = encodings['attention_mask']
    labels = tf.convert_to_tensor(df['label'].tolist())
    
    dataset = tf.data.Dataset.from_tensor_slices(({'input_ids': input_ids, 'attention_mask': attention_masks}, labels))
    dataset = dataset.batch(8)
    
    return dataset

filename = 'data/movie.csv'
dataset = prepare_data(filename, tokenizer)

In [4]:
BATCHES = 40000 // 8  
TRAIN_SIZE = int(BATCHES * 0.8)
VAL_SIZE = int(BATCHES * 0.1)

train_dataset = dataset.take(TRAIN_SIZE)
val_dataset = dataset.skip(TRAIN_SIZE).take(VAL_SIZE)
test_dataset = dataset.skip(TRAIN_SIZE + VAL_SIZE)

In [5]:
# Freeze some bottom layers to use pretrained weights
def freeze_bottom_layers(model, num_layers_to_freeze):
    encoder = model.bert.encoder

    for layer in encoder.layer[:num_layers_to_freeze]:
        layer.trainable = False

freeze_bottom_layers(model_4_bert, num_layers_to_freeze=8)

for i, layer in enumerate(model_4_bert.bert.encoder.layer):
    print(f'Layer {i} trainable: {layer.trainable}')

Layer 0 trainable: False
Layer 1 trainable: False
Layer 2 trainable: False
Layer 3 trainable: False
Layer 4 trainable: False
Layer 5 trainable: False
Layer 6 trainable: False
Layer 7 trainable: False
Layer 8 trainable: True
Layer 9 trainable: True
Layer 10 trainable: True
Layer 11 trainable: True


In [14]:
optimizer = tf.keras.optimizers.Adam(learning_rate=7e-6)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy')]
    
model_4_bert.compile(optimizer=optimizer, loss=loss, metrics=metrics)

In [16]:
model_4_bert.fit(train_dataset, validation_data=val_dataset, epochs=1)



<tf_keras.src.callbacks.History at 0x7fde06efd5b0>

In [17]:
loss_bert, accuracy_bert = model_4_bert.evaluate(test_dataset)



In [18]:
print(f'BERT Loss: {loss_bert}\nBERT Accuracy: {accuracy_bert}')

BERT Loss: 0.23736098408699036
BERT Accuracy: 0.934499979019165


In [19]:
model_4_bert.save_weights('saved_models/model_4_bert_weights.h5')

### Inference
This is a very simple demonstration of predictions using this model. Please follow instructions in the `README.md` for complete guide.

In [61]:
def prepare_data_for_prediction(input_data, tokenizer):
    
    if isinstance(input_data, list):
        texts = input_data
    elif is_file(input_data):
        if input_data.endswith('.csv'):
            df = pd.read_csv(input_data)
            texts = df['text'].tolist()
        elif input_data.endswith('.txt'):
            with open(input_data, 'r') as file:
                texts = file.readlines()
                texts = [line.strip() for line in texts]
        else:
            raise ValueError("Unsupported file format. Please provide a .csv or .txt file.")
    
    elif isinstance(input_data, str):
        texts = [input_data]
    else:
        raise ValueError("Unsupported input type. Please provide a file path, a single string, or a list of strings.")

    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors='tf'
    )

    input_ids = encodings['input_ids']
    attention_masks = encodings['attention_mask']
    return {'input_ids': input_ids, 'attention_mask': attention_masks}

def predict(model, preprocessed_data):
    inputs = {k: v for k, v in preprocessed_data.items()}
    outputs = model(inputs)
    logits = outputs.logits
    predictions = tf.argmax(logits, axis=-1).numpy()
    return predictions
def is_file(input_string):
    return os.path.isfile(input_string)

In [62]:
prepped = prepare_data_for_prediction(input_data='data/test.txt',
                                      tokenizer=tokenizer)

In [63]:
predict(model_4_bert, prepped)

array([1, 1, 0])