# Raw Data & Import

In [1]:
import sys
import os
# Put the parent of the current working directory on the import path;
# presumably this is what lets later cells import the project-local `utils` package.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
import pandas as pd
import tensorflow as tf
from dotenv import load_dotenv
# Read environment variables from a .env file (python-dotenv).
load_dotenv()
# Parent directory of the current working directory; later cells build the
# dataset and model paths relative to it.
project_root = os.path.dirname(os.getcwd())

In [11]:
# Read the preprocessed dataset from its pickle file.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
path = os.path.join(project_root, "datasets/processed/data.pkl")
# Fold label 2 into label 1; every other label value is kept unchanged.
data = pd.read_pickle(path).assign(
    label=lambda frame: frame['label'].apply(
        lambda value: value if value != 2 else 1
    )
)
data.shape

(117114, 11)

In [10]:
#from utils.gcp import load_data_from_gcs
#from google.auth import credentials
#from google.cloud import storage
#service_account = os.path.join(project_root, os.getenv("GCP_SERVICE_ACCOUNT"))
#client = storage.Client.from_service_account_json(service_account)
#
#
## Load data from GCS
#bucket_name = os.getenv("GCP_BUCKET_NAME")
#file_name = os.getenv("GCP_DATA_PATH")
#data = load_data_from_gcs(bucket_name, file_name, client)

In [5]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,source,text,label,id,corrected_text,tokens
0,toxic_comment,| This was me and I haven't edited wikipedia i...,0,59acfdecb57c450ea3c2c1cd8f00af90,this was me and i have not edited wikipedia in...,"[this, be, me, and, i, have, not, edit, wikipe..."
1,hate_speech,"@chvrlesGoldie like Hov said ""we all ghetto b""...",1,7f683aaf0fab427c84fcb4d224e76667,person like how said we all hetty b,"[person, like, how, say, we, all, hetty, b]"
2,toxic_comment,POSTSCRIPT: And this article is still really p...,1,d6c357344a0145218dc68eecd3ea69bc,postscript and this article is still really pa...,"[postscript, and, this, article, be, still, re..."
3,toxic_comment,"The current infobox of this section, wich repr...",0,db23ee8f56ac475f945bb645861074f4,the current infobox of this section with repre...,"[the, current, <UNK>, of, this, section, with,..."
4,hate_speech,"I wasn't born lastnight , I know theses hoes a...",1,95e66b8a95e54ccaa2c259d3abdcc864,i was not born lastnight i know these hoes is ...,"[i, be, not, bear, <UNK>, i, know, these, hoe,..."


# Training setup

In [12]:
from utils.custom_metrics import RecallMulticlass, PrecisionMulticlass, F1ScoreMulticlass, WeightedCategoricalCrossEntropy

# Metrics tracked during training, each configured for two classes
metrics = [
    metric_cls(name=metric_name, n_class=2)
    for metric_name, metric_cls in (
        ("recall", RecallMulticlass),
        ("precision", PrecisionMulticlass),
        ("f1", F1ScoreMulticlass),
    )
]

# Class weights: inverse of each label's relative frequency (labels in sorted
# order), rescaled so the weights sum to 1
class_frequencies = data["label"].value_counts(normalize=True).sort_index().values
inverse_frequencies = 1 / class_frequencies
weights = inverse_frequencies / inverse_frequencies.sum()

# Loss object built from the class weights
loss = WeightedCategoricalCrossEntropy(weights)

2025-01-24 18:28:15.795338: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2025-01-24 18:28:15.795472: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-01-24 18:28:15.796211: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2025-01-24 18:28:15.796872: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-01-24 18:28:15.797467: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


# BERT

In [13]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import pandas as pd
import tensorflow as tf


# Features are the corrected texts; targets are the labels, one-hot encoded
# over two classes.
X = data["corrected_text"]
y = tf.keras.utils.to_categorical(data["label"], num_classes=2)

# Hold out 10% of the rows, with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

# Pre-trained tokenizer used by the encoding helper below
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def to_text_list(texts):
    """Return ``texts`` as a plain Python list of strings.

    Accepts a single string (wrapped into a one-element list), any object
    exposing ``.tolist()`` (e.g. a pandas Series or NumPy array), or any other
    iterable of strings.
    """
    if isinstance(texts, str):
        # A bare string would otherwise be split into characters by list().
        return [texts]
    if hasattr(texts, "tolist"):
        return texts.tolist()
    return list(texts)


# Tokenize and encode the data
def encode_data(texts, max_length=128):
    """Tokenize ``texts`` with the notebook-level ``tokenizer``.

    Args:
        texts: A pandas Series, list/tuple, other iterable of strings, or a
            single string.
        max_length: Every sequence is padded or truncated to this many tokens.

    Returns:
        The tokenizer's output, with TensorFlow tensors (``return_tensors='tf'``).
    """
    return tokenizer(
        to_text_list(texts),
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='tf'
    )

# Tokenize the training texts and the held-out texts
encoded_data, encoded_val_data = (
    encode_data(split) for split in (X_train, X_test)
)

# Turn the one-hot label arrays into tensors, in the same order
labels, val_labels = (
    tf.convert_to_tensor(targets) for targets in (y_train, y_test)
)

  from .autonotebook import tqdm as notebook_tqdm


(105402,) (11712,) (105402, 2) (11712, 2)


In [14]:
# Compare the share of each class in the training and held-out labels
def _class_distribution(one_hot):
    """Per-class column sums of ``one_hot`` divided by the sum of all its entries."""
    return tf.divide(tf.reduce_sum(one_hot, axis=0), tf.reduce_sum(one_hot))

print(f"y_train class distribution: {_class_distribution(labels)}")
print(f"y_test class distribution: {_class_distribution(val_labels)}")

y_train class distribution: [0.6660215  0.33397847]
y_test class distribution: [0.66282445 0.33717555]


In [15]:
# Build the tf.data pipelines. Each element is a (features, label) pair, where
# features is the tokenizer output converted to a plain dict.
batch_size = 128
# Shuffle only the training data, so batches differ from one epoch to the
# next; the seed keeps the order repeatable across runs. The buffer is bounded
# to limit the extra memory the shuffle needs.
shuffle_buffer_size = 10_000
shuffle_seed = 42

training_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(encoded_data), labels))
    .shuffle(shuffle_buffer_size, seed=shuffle_seed)
    .batch(batch_size)
    # Prepare upcoming batches while the current one is being consumed.
    .prefetch(tf.data.AUTOTUNE)
)

# Validation data keeps its original order (no shuffle).
validation_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(encoded_val_data), val_labels))
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

In [16]:
import os
import tensorflow as tf

# Early stopping

def early_stopping():
    """Return an EarlyStopping callback that watches ``val_recall``.

    Configured with mode 'max', a patience of 10 epochs, a minimum delta of
    0.001, and restoring the best weights.
    """
    settings = {
        "monitor": 'val_recall',
        "mode": 'max',
        "patience": 10,
        "min_delta": 0.001,
        "restore_best_weights": True,
    }
    return tf.keras.callbacks.EarlyStopping(**settings)

# TensorBoard
#def tensorboard(log_dir:str = os.path.join(project_root, "logs", "fit")):
#    return tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# ModelCheckpoint
def model_checkpoint(model_name):
    """Return a ModelCheckpoint callback writing under ``<project_root>/models/bert``.

    Only weights are saved, and only when ``val_recall`` (mode 'max') improves.

    Args:
        model_name: Last path component of the checkpoint location.
    """
    checkpoint_path = os.path.join(project_root, "models", "bert", f"{model_name}")
    # NOTE(review): `save_format` does not appear among ModelCheckpoint's
    # documented parameters -- confirm it has any effect in the installed
    # Keras version before relying on it.
    return tf.keras.callbacks.ModelCheckpoint(
        checkpoint_path,
        monitor='val_recall',
        mode='max',
        save_best_only=True,
        save_weights_only=True,
        save_format='tf',
        verbose=1,
    )

In [17]:
# Callbacks passed to model.fit: early stopping, then checkpointing
callbacks = [
    early_stopping(),
    model_checkpoint("bert_model_test"),
]

In [19]:
from transformers import TFBertModel, BertTokenizer
import tensorflow as tf


def build_bert_model(loss, metrics: list, name: str = "bert_model",
                     max_length: int = 128, n_classes: int = 2):
    """Build and compile a dense classification head on a frozen pre-trained BERT.

    Args:
        loss: Loss passed unchanged to ``model.compile`` (a loss object, not a
            list).
        metrics: Metrics passed unchanged to ``model.compile``.
        name: Name given to the resulting Keras model.
        max_length: Token-sequence length the two input layers accept. It must
            equal the length the tokenized inputs were padded/truncated to.
        n_classes: Number of units in the softmax output layer. It must equal
            the width of the one-hot labels.

    Returns:
        The compiled ``tf.keras.Model``. Its summary is printed as a side effect.
    """
    # Load the pre-trained BERT model
    bert_model = TFBertModel.from_pretrained('bert-base-uncased')

    # Freeze every BERT layer so that only the layers added below are trained
    for layer in bert_model.layers:
        layer.trainable = False

    # Input layers; their names match the keys the model is fed with
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")

    # Run BERT and keep its pooled output for classification
    bert_outputs = bert_model(input_ids, attention_mask=attention_mask)
    pooled_output = bert_outputs.pooler_output

    # Classification head: three ReLU layers with dropout after the first two
    x = tf.keras.layers.Dense(128, activation='relu')(pooled_output)
    x = tf.keras.layers.Dropout(0.2)(x)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    x = tf.keras.layers.Dropout(0.2)(x)
    x = tf.keras.layers.Dense(32, activation='relu')(x)
    output = tf.keras.layers.Dense(n_classes, activation='softmax')(x)

    # Create the model
    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output, name=name)

    # Compile the model
    model.compile(optimizer=tf.keras.optimizers.legacy.Adam(),
                  loss=loss,
                  metrics=metrics)

    # Summary of the model
    model.summary()

    return model

# Build the classifier with the loss and metrics defined in the training setup
model = build_bert_model(loss=loss, metrics=metrics, name="bert_model")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model: "bert_model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 128)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 128)]                0         []                            
 )                                                                                                
                                                                                                  
 tf_bert_model_1 (TFBertMod  TFBaseModelOutputWithPooli   1094822   ['input_ids[0][0]',           
 el)                         ngAndCrossAttentions(last_   40         'attention_mask[0][0]']      
                             hidden_state=(None, 128, 7                                  

In [None]:
# Train for a single epoch, evaluating on the validation dataset
model_history = model.fit(
    training_dataset,
    validation_data=validation_dataset,
    epochs=1,
    callbacks=callbacks,
)

Epoch 1: val_recall improved from -inf to 0.51188, saving model to /Users/theopinto--dalle/code/arewetoxic/models/bert/bert_model_test


In [36]:
# Save the model's weights (weights only, not the full model)
weights_path = os.path.join(project_root, "models", "bert_dummy.h5")
model.save_weights(weights_path)