# Data Import

In [1]:
import os
import sys

import pandas as pd
import tensorflow as tf
from dotenv import load_dotenv

# The parent of the current working directory is treated as the project root.
project_root = os.path.dirname(os.getcwd())

# Put that same parent directory on the import path (the `utils` package is
# imported in later cells).
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Read variables from a .env file into the process environment.
load_dotenv()

In [None]:
import ast

from utils.gcp import load_data_from_gcs
from google.auth import credentials
# The module is `storage`; the previous spelling `storag` raised ImportError
# and left `storage` undefined for the client construction below.
from google.cloud import storage

# Fail early with a readable message when the environment is incomplete,
# instead of an opaque TypeError from os.path.join(project_root, None).
required_env = ("GCP_SERVICE_ACCOUNT", "GCP_BUCKET_NAME", "GCP_DATA_PATH")
missing_env = [name for name in required_env if not os.getenv(name)]
if missing_env:
    raise EnvironmentError(
        f"Missing required environment variables: {', '.join(missing_env)}"
    )

# The service-account key file path is resolved relative to the project root.
service_account = os.path.join(project_root, os.getenv("GCP_SERVICE_ACCOUNT"))
client = storage.Client.from_service_account_json(service_account)

# Load data from GCS
bucket_name = os.getenv("GCP_BUCKET_NAME")
file_name = os.getenv("GCP_DATA_PATH")
data = load_data_from_gcs(bucket_name, file_name, client)

# The `tokens` column is parsed from its string form. ast.literal_eval only
# accepts Python literals, so it cannot execute arbitrary code embedded in the
# downloaded file the way eval() could.
data["tokens"] = data["tokens"].apply(ast.literal_eval)
data["label"] = data["label"].astype(int)

# Shuffle data and reset_index; the fixed seed makes the row order
# reproducible across kernel restarts.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

In [2]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,source,text,label,id,corrected_text,tokens
0,hate_speech,Crying to the G20 that some Americans don't li...,1,fef26687bd954e0ba7f7b4448aabfaab,crying to the g that some americans do not lik...,"[cry, to, the, g, that, some, american, do, no..."
1,hate_speech,RT @xDSmooth: I took yo job bitch @1BookieG,1,e35367e0277343fa8f60cbef166dbc0e,i took hello job bitch person,"[i, take, hello, job, bitch, person]"
2,toxic_comment,"""\n\n Go fuck yourself!! \n\n fuck you """,2,c1c176d2b62246fda473d0280629fab6,go fuck yourself fuck you,"[go, fuck, yourself, fuck, you]"
3,toxic_comment,I fix articles on notable subjects frequently ...,0,15b181935a9e4ba3aeb46afe58dec582,i fix articles on notable subjects frequently ...,"[i, fix, article, on, notable, subject, freque..."
4,toxic_comment,"FUCK YOU, YOU ATHEIST CUNT! FUCK YOU, YOU ATHE...",2,046ec2535ad246bdb11d0671fbcf6ecf,fuck atheist count fuck atheist count fuck ath...,"[fuck, atheist, count, fuck, atheist, count, f..."


# Load training data

In [3]:
# Read the processed dataset from the pickle stored under the project root.
path = os.path.join(project_root, "datasets/processed/data.pkl")
data = pd.read_pickle(path)

# Merge label 2 into label 1; every other label value is kept as it is.
data["label"] = data["label"].map(lambda label: 1 if label == 2 else label)

data.head()

Unnamed: 0,source,text,label,id,corrected_text,tokens,tokens_length,unique_tokens,log_tokens_length,unique_ratio,log_unique_ratio
0,toxic_comment,"""I defy you ==\nTo find anything that shows I'...",0,fca5d312945c4365a4768bbbcb2d8d9f,i defy you,"[i, defy, you]",3,3,1.386294,1.0,1.0
1,toxic_comment,What a lovely surprise! \n\n ]]\nHello Phaedri...,0,9d6d04e4216a4bd98b040dd98ead6656,what a lovely surprise hello phaedriel it cert...,"[what, a, lovely, surprise, hello, <UNK>, it, ...",53,39,3.988984,0.735849,0.735849
2,toxic_comment,I still think you are a tit.,1,111c766778f443bf88582087c7f5f2e3,i still think you are a tit,"[i, still, think, you, be, a, tit]",7,7,2.079442,1.0,1.0
3,toxic_comment,"""== 8 Focused vs. 8 Flexible (8B) ==\n\nI adde...",0,f449ff0dfff14526bebac6360ce205c0,i added a situation needed to the assertion th...,"[i, add, a, situation, need, to, the, assertio...",59,44,4.094345,0.745763,0.745763
4,toxic_comment,I'm gonna revert because I found all that info...,0,b841045744b646af9298d16f44684eb6,i am going to revert because i found all that ...,"[i, be, go, to, revert, because, i, find, all,...",44,37,3.806662,0.840909,0.840909


# Training setup

In [4]:
from utils.custom_metrics import (
    RecallMulticlass,
    PrecisionMulticlass,
    F1ScoreMulticlass,
    WeightedCategoricalCrossEntropy,
)

# metrics: recall, precision and F1, each configured for two classes
metrics = [
    RecallMulticlass(name="recall", n_class=2),
    PrecisionMulticlass(name="precision", n_class=2),
    F1ScoreMulticlass(name="f1", n_class=2),
]

# weights: inverse of each label's relative frequency (labels in ascending
# order), rescaled so that the weights sum to 1
label_frequencies = data["label"].value_counts(normalize=True).sort_index().values
inverse_frequencies = 1 / label_frequencies
weights = inverse_frequencies / inverse_frequencies.sum()

# loss: project-defined cross-entropy parameterised by the class weights
loss = WeightedCategoricalCrossEntropy(weights)

2024-12-02 18:27:50.926938: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-12-02 18:27:50.927014: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-12-02 18:27:50.927033: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-12-02 18:27:50.927308: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-12-02 18:27:50.927805: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


# BERT

In [6]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
import pandas as pd
import tensorflow as tf


# Inputs are the corrected texts; targets are the labels one-hot encoded over
# two classes.
X = data["corrected_text"]
y = tf.keras.utils.to_categorical(data["label"], num_classes=2)

# Hold out 10% of the rows for validation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)
print(*(split.shape for split in (X_train, X_test, y_train, y_test)))

# Tokenizer matching the pretrained 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and encode the data
def encode_data(texts, max_length=128):
    """Tokenize ``texts`` with the notebook-level ``tokenizer``.

    Args:
        texts: Object exposing ``.tolist()``; the notebook passes a pandas
            Series of strings.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        The value returned by ``tokenizer(...)`` when asked for TensorFlow
        tensors (``return_tensors='tf'``).
    """
    tokenizer_options = {
        "padding": 'max_length',
        "truncation": True,
        "max_length": max_length,
        "return_tensors": 'tf',
    }
    text_list = texts.tolist()
    return tokenizer(text_list, **tokenizer_options)

# Tokenize the training split first, then the held-out split.
encoded_data, encoded_val_data = encode_data(X_train), encode_data(X_test)

# Turn the one-hot label arrays into tensors, in the same order.
labels, val_labels = tf.convert_to_tensor(y_train), tf.convert_to_tensor(y_test)

(105402,) (11712,) (105402, 2) (11712, 2)


In [10]:
# Share of each class in the two splits: per-class column sums of the one-hot
# labels divided by the total sum of the labels.
train_distribution = tf.divide(tf.reduce_sum(labels, axis=0), tf.reduce_sum(labels))
test_distribution = tf.divide(tf.reduce_sum(val_labels, axis=0), tf.reduce_sum(val_labels))

print(f"y_train class distribution: {train_distribution}")
print(f"y_test class distribution: {test_distribution}")

y_train class distribution: [0.6660215  0.33397847]
y_test class distribution: [0.66282445 0.33717555]


In [11]:
# Number of examples per batch, shared by both datasets.
batch_size = 128

# Pair the tokenizer output (as a plain dict) with the label tensor, then
# batch the resulting dataset.
training_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(encoded_data), labels)
).batch(batch_size)

validation_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(encoded_val_data), val_labels)
).batch(batch_size)

In [19]:
import os
import tensorflow as tf

# Early stopping

def early_stopping():
    """Build the EarlyStopping callback used for training.

    The callback watches ``val_recall`` in ``max`` mode with ``patience=10``
    and ``min_delta=0.001``, and is configured with
    ``restore_best_weights=True``.

    Returns:
        A ``tf.keras.callbacks.EarlyStopping`` instance.
    """
    stopping_options = dict(
        monitor='val_recall',
        patience=10,
        mode='max',
        min_delta=0.001,
        restore_best_weights=True,
    )
    return tf.keras.callbacks.EarlyStopping(**stopping_options)

# TensorBoard
#def tensorboard(log_dir:str = os.path.join(project_root, "logs", "fit")):
#    return tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# ModelCheckpoint
def model_checkpoint(model_name):
    """Build the ModelCheckpoint callback for ``model_name``.

    Args:
        model_name: Last path component of the checkpoint, placed under
            ``<project_root>/models/bert/``.

    Returns:
        A ``tf.keras.callbacks.ModelCheckpoint`` that monitors ``val_recall``
        in ``max`` mode, with ``save_best_only`` and ``save_weights_only``
        enabled and verbose logging on.
    """
    checkpoint_path = os.path.join(project_root, "models", "bert", f"{model_name}")
    checkpoint_options = dict(
        filepath=checkpoint_path,
        monitor='val_recall',
        save_best_only=True,
        save_weights_only=True,
        save_format='tf',
        mode='max',
        verbose=1,
    )
    return tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)

In [20]:
# Callbacks handed to model.fit: early stopping plus a checkpoint named
# "bert_model_test".
callbacks = [
    early_stopping(),
    model_checkpoint("bert_model_test"),
]

In [22]:
from transformers import TFBertModel, BertTokenizer
import tensorflow as tf


def build_bert_model(loss, metrics: list, name: str = "bert_model",
                     max_length: int = 128, n_classes: int = 2) -> tf.keras.Model:
    """Build and compile a classifier on top of a frozen pretrained BERT.

    The model feeds ``input_ids`` and ``attention_mask`` through
    ``bert-base-uncased``, takes the pooled output, and passes it through
    Dense(128, relu) -> Dropout(0.2) -> Dense(64, relu) -> Dense(n_classes,
    softmax). The model summary is printed as a side effect.

    Args:
        loss: Loss passed unchanged to ``Model.compile``. The notebook passes
            a single loss object, so this is not annotated as a list.
        metrics: List of metrics passed unchanged to ``Model.compile``.
        name: Name given to the Keras model.
        max_length: Token sequence length of both inputs. It has to match the
            length the texts were padded/truncated to when tokenized.
        n_classes: Number of units in the softmax output layer.

    Returns:
        The compiled ``tf.keras.Model``.
    """
    # Load the pre-trained BERT model
    bert_model = TFBertModel.from_pretrained('bert-base-uncased')

    # Freeze every layer of the BERT model so only the new head is trained
    for layer in bert_model.layers:
        layer.trainable = False

    # Input names match the keys produced by the tokenizer so that a dataset
    # of dicts can be fed directly to the model
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name="attention_mask")

    # Get the output from the BERT model
    bert_outputs = bert_model(input_ids, attention_mask=attention_mask)

    # Use the pooled output for classification
    pooled_output = bert_outputs.pooler_output

    # Classification head
    x = tf.keras.layers.Dense(128, activation='relu')(pooled_output)
    x = tf.keras.layers.Dropout(0.2)(x)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    output = tf.keras.layers.Dense(n_classes, activation='softmax')(x)

    # Create the model
    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output, name=name)

    # Compile the model
    model.compile(optimizer=tf.keras.optimizers.legacy.Adam(),
                  loss=loss,
                  metrics=metrics)

    # Summary of the model
    model.summary()

    return model

# Instantiate the classifier with the loss and metrics configured earlier.
model = build_bert_model(loss=loss, metrics=metrics, name="bert_model")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model: "bert_model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 128)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 128)]                0         []                            
 )                                                                                                
                                                                                                  
 tf_bert_model_4 (TFBertMod  TFBaseModelOutputWithPooli   1094822   ['input_ids[0][0]',           
 el)                         ngAndCrossAttentions(last_   40         'attention_mask[0][0]']      
                             hidden_state=(None, 128, 7                                  

In [23]:
# Train for up to 100 epochs, validating on the held-out dataset after each
# epoch; the callbacks are applied during training.
model_history = model.fit(
    training_dataset,
    validation_data=validation_dataset,
    epochs=100,
    callbacks=callbacks,
)

Epoch 1/100


  inputs = self._flatten_to_reference_inputs(inputs)
2024-12-02 18:36:51.023782: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epoch 1: val_recall improved from -inf to 0.86064, saving model to /Users/theopinto--dalle/code/arewetoxic/models/bert/bert_model_test
Epoch 2/100
Epoch 2: val_recall improved from 0.86064 to 0.87561, saving model to /Users/theopinto--dalle/code/arewetoxic/models/bert/bert_model_test
Epoch 3/100
Epoch 3: val_recall did not improve from 0.87561
Epoch 4/100
Epoch 4: val_recall improved from 0.87561 to 0.87909, saving model to /Users/theopinto--dalle/code/arewetoxic/models/bert/bert_model_test
Epoch 5/100
Epoch 5: val_recall improved from 0.87909 to 0.87933, saving model to /Users/theopinto--dalle/code/arewetoxic/models/bert/bert_model_test
Epoch 6/100
Epoch 6: val_recall did not improve from 0.87933
Epoch 7/100
Epoch 7: val_recall did not improve from 0.87933
Epoch 8/100
Epoch 8: val_recall did not improve from 0.87933
Epoch 9/100
Epoch 9: val_recall did not improve from 0.87933
Epoch 10/100
Epoch 10: val_recall did not improve from 0.87933
Epoch 11/100
Epoch 11: val_recall did not impro

In [12]:
# Try loading the model by importing the saved weights into a freshly built,
# untrained copy of the architecture.

# Load the pre-trained BERT model
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Freeze the BERT model layers
for layer in bert_model.layers:  # Freeze all layers
    layer.trainable = False

# Define the input layers
input_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name="input_ids")
attention_mask = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name="attention_mask")

# Get the output from the BERT model
bert_outputs = bert_model(input_ids, attention_mask=attention_mask)

# Use the pooled output for classification
pooled_output = bert_outputs.pooler_output

# Add custom layers. The output layer has 2 units so the architecture matches
# the model that was trained and checkpointed (Dense(2) in build_bert_model)
# and the 2-class `loss` / `metrics` it is compiled with below. A 3-unit head
# cannot receive the saved 2-unit output weights.
x = tf.keras.layers.Dense(128, activation='relu')(pooled_output)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(64, activation='relu')(x)
output = tf.keras.layers.Dense(2, activation='softmax')(x)

# Create the model
test_model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile the model
test_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=loss,
    metrics=metrics,
)

# Load the weights from the location the checkpoint callback writes to:
# model_checkpoint("bert_model_test") saves under <project_root>/models/bert/,
# not directly under <project_root>/models/.
test_model.load_weights(os.path.join(project_root, "models", "bert", "bert_model_test"))

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x38902b6d0>