1. mount, install, and import

In [None]:
from google.colab import drive
drive.mount('/content/drive')

!pip install -U "tensorflow-text==2.13.*"
!pip install "tf-models-official==2.13.*"
!pip install transformers

import os
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras import layers
from transformers import BertTokenizer

print(f"TensorFlow version: {tf.__version__}")
print(f"TensorFlow Hub version: {hub.__version__}")

2. load tokenizer

In [None]:
# Instantiate the tokenizer for the 'bert-base-uncased' checkpoint.
# It is kept as a module-level name because df_to_dataset reads it directly.
tokenizer_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

3. preprocess data

In [None]:
# Converts the Action column to binary format where Recruiting is 1, and other actions are 0.

def preprocess_data(df):
    """Return a copy of ``df`` whose 'Action' column is a 0/1 integer label.

    Rows whose 'Action' equals the string 'Recruiting' become 1; every other
    value becomes 0. The input frame is left untouched, so re-running a cell
    that calls this function on the same raw frame always gives the same
    result.

    Note: the function expects the raw string labels. Applying it to an
    already-converted frame compares integers with 'Recruiting' and therefore
    maps every row to 0.

    Args:
        df: DataFrame containing an 'Action' column.

    Returns:
        A new DataFrame with the same columns, 'Action' replaced by ints.

    Raises:
        KeyError: if ``df`` has no 'Action' column.
    """
    is_recruiting = (df['Action'] == 'Recruiting').astype(int)
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(Action=is_recruiting)

# Encodes the text data into token IDs, attention masks, and token type IDs using the BERT tokenizer.
# This step converts raw text into numerical format suitable for input to a BERT model.
def encode(texts, tokenizer, max_length=128):
    """Tokenize ``texts`` into fixed-width model inputs.

    Every sequence is truncated and padded to exactly ``max_length`` tokens.
    Padding to a fixed width (rather than to the longest text in the call)
    matters because the model's input layers are declared with a fixed
    sequence length; a split whose longest message is shorter than that
    length would otherwise produce tensors of the wrong width.

    Args:
        texts: a pandas Series (or any object with ``tolist()``, or a plain
            iterable) of strings.
        tokenizer: callable tokenizer accepting the keyword arguments used
            below and returning a mapping with 'input_ids', 'attention_mask'
            and 'token_type_ids'.
        max_length: width every encoded sequence is truncated/padded to.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids)`` taken from the
        tokenizer's output.
    """
    text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
    encodings = tokenizer(
        text_list,
        truncation=True,
        padding='max_length',  # fixed width, not "longest in this call"
        max_length=max_length,
        return_tensors='tf'
    )
    return encodings['input_ids'], encodings['attention_mask'], encodings['token_type_ids']

# Converts the preprocessed DataFrame into a TensorFlow Dataset.
# This dataset includes tokenized inputs and labels, ready for model training.

def df_to_dataset(dataframe, shuffle=True, batch_size=32, max_length=128):
    """Build a batched ``tf.data.Dataset`` of (features, label) pairs.

    The 'Message' column is tokenized with the module-level ``tokenizer`` via
    ``encode``; the 'Action' column supplies the labels. Features are a dict
    keyed 'input_word_ids', 'input_mask' and 'input_type_ids'.

    Args:
        dataframe: DataFrame with 'Action' and 'Message' columns. It is
            copied first, so the caller's frame is not modified.
        shuffle: when true, shuffle with a buffer covering the whole frame.
        batch_size: number of examples per batch.
        max_length: sequence length forwarded to ``encode``.

    Returns:
        The batched (and optionally shuffled) dataset.
    """
    dataframe = dataframe.copy()
    labels = dataframe.pop('Action')
    # Empty CSV cells are read back as NaN (a float), which a text tokenizer
    # cannot process; coerce every message to a string so one blank row does
    # not abort dataset construction.
    texts = dataframe.pop('Message').fillna('').astype(str)
    input_ids, attention_masks, token_type_ids = encode(texts, tokenizer, max_length=max_length)

    features = {
        "input_word_ids": input_ids,
        "input_mask": attention_masks,
        "input_type_ids": token_type_ids  # key is 'input_type_ids', not 'segment_ids'
    }
    ds = tf.data.Dataset.from_tensor_slices((features, labels.values))
    if shuffle:
        # Buffer as large as the frame, i.e. a full shuffle of the examples.
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds


# Read the raw CSV from Drive and turn 'Action' into a binary label.
data = preprocess_data(pd.read_csv('/content/drive/My Drive/data.csv'))

# Hold out 30% of the rows, then cut that hold-out in half: the result is a
# 70/15/15 train/validation/test split. random_state is fixed for repeatability.
train_data, temp_data = train_test_split(data, test_size=0.30, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.50, random_state=42)

# Persist each split to Drive so later cells can reload them.
for split_frame, csv_path in (
    (train_data, '/content/drive/My Drive/train_data.csv'),
    (val_data, '/content/drive/My Drive/val_data.csv'),
    (test_data, '/content/drive/My Drive/test_data.csv'),
):
    split_frame.to_csv(csv_path, index=False)

print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")

Training set size: 6062
Validation set size: 1299
Test set size: 1300


4. load dataset and prep training

In [None]:
# Load datasets from disk
train_data = pd.read_csv('/content/drive/My Drive/train_data.csv')
val_data = pd.read_csv('/content/drive/My Drive/val_data.csv')
test_data = pd.read_csv('/content/drive/My Drive/test_data.csv')

batch_size = 32
AUTOTUNE = tf.data.AUTOTUNE

# Only the training split is shuffled; validation and test keep file order.
train_ds = df_to_dataset(train_data, batch_size=batch_size)
val_ds = df_to_dataset(val_data, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test_data, shuffle=False, batch_size=batch_size)

# The training pipeline is deliberately NOT cached. df_to_dataset shuffles
# before batching, and a cache() placed after shuffle() stores the first
# epoch's order and replays those exact batches on every later epoch, which
# defeats per-epoch reshuffling. The tensors are already in memory, so the
# cache would not save any work either.
train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)
# Validation/test are read in a fixed order, so caching them is harmless.
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

4. create and compile model




In [None]:
def create_model(
    max_length=128,
    bert_handle="https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/2",
):
    """Build a binary classifier: a TF-Hub BERT encoder plus a sigmoid head.

    The model takes three int32 inputs of shape ``(max_length,)`` named
    'input_word_ids', 'input_mask' and 'input_type_ids', feeds them to the
    hub encoder as a dict, and applies a single-unit sigmoid Dense layer to
    the encoder's 'pooled_output'.

    Args:
        max_length: sequence length of every input. It must equal the width
            the tokenized data is padded to, otherwise the dataset tensors
            will not fit the input layers.
        bert_handle: TF-Hub handle of the encoder; it is loaded with
            ``trainable=True`` so its weights are fine-tuned.

    Returns:
        An uncompiled ``tf.keras.Model`` producing one probability per example.
    """
    # Token IDs produced by the tokenizer.
    input_word_ids = layers.Input(shape=(max_length,), dtype=tf.int32, name='input_word_ids')
    # Attention mask separating real tokens from padding.
    input_mask = layers.Input(shape=(max_length,), dtype=tf.int32, name='input_mask')
    # Segment IDs distinguishing sentences within a single input.
    input_type_ids = layers.Input(shape=(max_length,), dtype=tf.int32, name='input_type_ids')

    bert_layer = hub.KerasLayer(bert_handle, trainable=True)
    # The encoder is called with a dict keyed by the input names.
    bert_inputs = {
        'input_word_ids': input_word_ids,
        'input_mask': input_mask,
        'input_type_ids': input_type_ids
    }

    # bert_layer returns a dictionary of output tensors
    bert_outputs = bert_layer(bert_inputs)
    pooled_output = bert_outputs['pooled_output']  # Extract pooled_output
    # A single sigmoid unit yields the probability of the positive class.
    output = layers.Dense(1, activation='sigmoid')(pooled_output)

    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=output)
    return model

# Build the network with its default configuration.
model = create_model()

# Training configuration: Adam with a 3e-5 learning rate, binary cross-entropy
# to match the single sigmoid output, and accuracy as the reported metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

5. Define callbacks and train model

In [None]:
# Single definition of the checkpoint location, shared by the resume logic
# and the checkpoint callback.
checkpoint_path = '/content/drive/My Drive/checkpoints/model_checkpoint.h5'

# Resume from an earlier run if a checkpoint exists. When resuming, measure
# the restored weights' validation loss so the checkpoint callback can start
# from that value: without it the callback has no "best so far" and would
# overwrite the saved checkpoint after the first resumed epoch even if that
# epoch is worse.
resume_val_loss = None
if os.path.exists(checkpoint_path):
    model.load_weights(checkpoint_path)
    print("Loaded model from checkpoint.")
    resume_val_loss = model.evaluate(val_ds, verbose=0, return_dict=True)['loss']

# Keep only the weights with the lowest validation loss seen so far.
checkpoint_cb = ModelCheckpoint(
    checkpoint_path,
    save_best_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1,
    initial_value_threshold=resume_val_loss  # None on a fresh run (the default)
)

# Stop after 3 epochs without val_loss improvement and roll back to the best weights.
early_stopping_cb = EarlyStopping(
    monitor='val_loss',
    patience=3,
    verbose=1,
    restore_best_weights=True
)

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=10,
    callbacks=[checkpoint_cb, early_stopping_cb]
)

6. Save and evaluate the model

In [None]:
# Write the trained model to Drive in HDF5 format.
final_model_path = '/content/drive/My Drive/final_model.h5'
model.save(final_model_path)

# Score the held-out test split; evaluate() returns the loss followed by the
# compiled metric (accuracy).
loss, accuracy = model.evaluate(test_ds)
print(f"Test loss: {loss}")
print(f"Test accuracy: {accuracy}")