In [None]:
import datetime
import os
import warnings

# Set before TensorFlow is imported so the native log level applies from start-up.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import keras_tuner as kt
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.mixed_precision import set_global_policy
from transformers import TFBertForSequenceClassification, BertTokenizer, BertConfig
from transformers import TFBertModel  # required by BertWrapper
from transformers import logging as hf_logging

set_global_policy('mixed_float16')
warnings.filterwarnings('ignore')
hf_logging.set_verbosity_error()

In [None]:
# Load the training data from 'train.csv' (path is relative to the working directory).
df = pd.read_csv('train.csv')

In [None]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
# The six label columns; they are summed per class and packed into the 'targets' column further down.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [None]:
# (rows, columns) of the loaded frame.
df.shape

(159571, 8)

In [None]:
# Column-wise sum of every label column (the per-label positive count, assuming 0/1 labels).
class_counts = df[label_columns].sum()

In [None]:
# Display the per-label totals.
class_counts

toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64

# Toxic Comment Classification using BERT

In [None]:
# 'targets' holds one list per row with the label values in label_columns order;
# 'text' is a copy of the raw comment column.
df['targets'] = df[label_columns].to_numpy().tolist()
df['text'] = df['comment_text']

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df['targets'].tolist(),
    test_size=0.2,
    random_state=77,
)

In [None]:
# Load the pretrained BERT tokenizer (the model itself is constructed later, inside build_model)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def tokenize_function(texts):
    """Encode ``texts`` with the module-level ``tokenizer``.

    Sequences are truncated to at most 128 tokens, padded to a common
    length, and returned as TensorFlow tensors.
    """
    encode_options = dict(
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors='tf',
    )
    return tokenizer(texts, **encode_options)

In [None]:
# Encode the training split first, then the validation split, with identical settings
train_encodings, val_encodings = map(tokenize_function, (train_texts, val_texts))

In [None]:
# Pair each encoded example (as a feature dict) with its float32 label vector
train_label_tensor = tf.convert_to_tensor(train_labels, dtype=tf.float32)
train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_encodings), train_label_tensor)
)

val_label_tensor = tf.convert_to_tensor(val_labels, dtype=tf.float32)
val_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(val_encodings), val_label_tensor)
)

In [None]:
# Shuffle individual examples *before* batching. Shuffling after `.batch()` only
# reorders whole batches, so every batch would contain the same examples in
# every epoch.
BATCH_SIZE = 32
SHUFFLE_BUFFER_SIZE = 10_000  # examples held in the shuffle buffer

train_dataset = (
    train_dataset
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)
# Validation order does not matter, so it is only batched and prefetched.
val_dataset = val_dataset.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
class BertWrapper(tf.keras.layers.Layer):
    """Keras layer that runs pretrained ``bert-base-uncased`` and returns its pooled output.

    The first ``num_frozen_layers`` encoder blocks are marked non-trainable so
    that fine-tuning only updates the remaining blocks.

    NOTE(review): the logged outputs of this notebook come from ``keras.src``
    (Keras 3), while transformers' TensorFlow models may be built on a
    different Keras package. Confirm that ``model.trainable_weights`` of the
    assembled model really contains the BERT variables.

    Args:
        config: ``BertConfig`` forwarded to ``TFBertModel.from_pretrained``.
        num_frozen_layers: Number of leading encoder blocks (indices
            ``0 .. num_frozen_layers - 1``) to freeze. Defaults to 8, the
            previously hard-coded value.
        **kwargs: Standard ``tf.keras.layers.Layer`` options (e.g. ``name``).

    Raises:
        ValueError: If ``num_frozen_layers`` is negative or larger than the
            number of encoder blocks in the loaded model.
    """

    def __init__(self, config, num_frozen_layers=8, **kwargs):
        super().__init__(**kwargs)
        self.num_frozen_layers = num_frozen_layers
        self.bert = TFBertModel.from_pretrained('bert-base-uncased', config=config)

        encoder_layers = self.bert.bert.encoder.layer
        if not 0 <= num_frozen_layers <= len(encoder_layers):
            raise ValueError(
                f"num_frozen_layers must be between 0 and {len(encoder_layers)}, "
                f"got {num_frozen_layers}"
            )

        # Freeze the leading encoder blocks.
        for index in range(num_frozen_layers):
            encoder_layers[index].trainable = False

    def call(self, inputs):
        """Run BERT on a dict with 'input_ids' and 'attention_mask'; return the pooled output."""
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.pooler_output


In [None]:
def build_model(hp):
    """Build and compile the BERT multi-label classifier for one Keras Tuner trial.

    Args:
        hp: Keras Tuner ``HyperParameters`` object. Two values are drawn from
            it: ``lr`` (log-sampled in [1e-6, 5e-5]) and ``dropout_rate``
            (in [0.1, 0.3]).

    Returns:
        A compiled Keras model with inputs ``input_ids`` and
        ``attention_mask`` and one sigmoid output per label (six in total),
        trained with binary cross-entropy and tracked with multi-label AUC.
    """
    from tensorflow.keras import layers, Model

    config = BertConfig.from_pretrained('bert-base-uncased')
    config.num_labels = 6

    learning_rate = hp.Float('lr', min_value=1e-6, max_value=5e-5, sampling='log')
    dropout_rate = hp.Float('dropout_rate', min_value=0.1, max_value=0.3)

    # Variable-length token sequences; names match the keys of the dataset's feature dict.
    input_ids = layers.Input(shape=(None,), dtype=tf.int32, name='input_ids')
    attention_mask = layers.Input(shape=(None,), dtype=tf.int32, name='attention_mask')

    bert_output = BertWrapper(config)({'input_ids': input_ids, 'attention_mask': attention_mask})

    x = layers.Dropout(dropout_rate)(bert_output)
    # The notebook enables the global 'mixed_float16' policy. Pin the output
    # layer to float32 so the sigmoid probabilities and the loss computed from
    # them are not evaluated in half precision.
    output = layers.Dense(config.num_labels, activation='sigmoid', dtype='float32')(x)

    model = Model(inputs=[input_ids, attention_mask], outputs=output)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
        metrics=[tf.keras.metrics.AUC(multi_label=True)]
    )

    return model


## Perform Hyperparameter Tuning

In [None]:
# Stop when val_loss has not improved for 2 epochs and roll back to the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True, verbose=1)

# Halve the learning rate after a single epoch without val_loss improvement
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=1,
)

In [None]:
# One timestamped TensorBoard log directory per notebook run
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = f"logs/fit/{run_stamp}"
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [None]:
# Callbacks shared by every tuning trial
search_callbacks = [tensorboard_callback, early_stopping, reduce_lr]

# Hyperband search over build_model's hyperparameters, minimising validation loss
tuner = kt.Hyperband(
    build_model,
    objective='val_loss',
    max_epochs=3,
    hyperband_iterations=1,
    directory='bert_toxic_comments_windows',
    project_name='bert_tuning',
)

tuner.search(
    train_dataset,
    validation_data=val_dataset,
    epochs=3,
    callbacks=search_callbacks,
)

Trial 6 Complete [06h 47m 13s]
val_loss: 0.11574476212263107

Best val_loss So Far: 0.11217503994703293
Total elapsed time: 1d 00h 56m 29s


In [None]:
# Retrieve the top-ranked hyperparameter set and report both tuned values
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
for label, hp_name in (("Best learning rate", 'lr'), ("Best dropout rate", 'dropout_rate')):
    print(f"{label}: {best_hps.get(hp_name)}")

Best learning rate: 4.041831825786239e-05
Best dropout rate: 0.25857560573155236


In [None]:
def final_build_model(hp):
    """Build the final classifier: the ``build_model`` architecture with extra metrics.

    The network is created by ``build_model`` so the two definitions cannot
    drift apart. It is then re-compiled with the same optimizer settings and
    loss, adding binary accuracy (threshold 0.5) alongside multi-label AUC.

    Args:
        hp: Keras Tuner ``HyperParameters`` carrying ``lr`` and
            ``dropout_rate`` (e.g. the best set returned by the tuner).

    Returns:
        A compiled Keras model ready for ``fit``.
    """
    # build_model registers/reads 'lr' and 'dropout_rate' on hp and wires up the network.
    model = build_model(hp)

    # Re-compile with a fresh optimizer at the same learning rate and the
    # extended metric list; no training has happened yet, so nothing is lost.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=hp.get('lr')),
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
        metrics=[
            tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.5),
            tf.keras.metrics.AUC(multi_label=True),
        ],
    )

    return model


In [None]:
# Instantiate the final model from the tuned hyperparameters
best_model = final_build_model(best_hps)

# Fit for up to five epochs, reusing the logging / early-stopping / LR-schedule callbacks
training_callbacks = [tensorboard_callback, early_stopping, reduce_lr]
best_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=5,
    callbacks=training_callbacks,
)

Epoch 1/5
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8186s[0m 2s/step - accuracy: 0.9276 - auc_4: 0.5381 - loss: 0.2116 - val_accuracy: 0.9633 - val_auc_4: 0.7197 - val_loss: 0.1276 - learning_rate: 4.0418e-05
Epoch 2/5
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8311s[0m 2s/step - accuracy: 0.9633 - auc_4: 0.6704 - loss: 0.1301 - val_accuracy: 0.9635 - val_auc_4: 0.7883 - val_loss: 0.1179 - learning_rate: 4.0418e-05
Epoch 3/5
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8336s[0m 2s/step - accuracy: 0.9634 - auc_4: 0.7526 - loss: 0.1209 - val_accuracy: 0.9635 - val_auc_4: 0.8320 - val_loss: 0.1112 - learning_rate: 4.0418e-05
Epoch 4/5
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8318s[0m 2s/step - accuracy: 0.9638 - auc_4: 0.7840 - loss: 0.1148 - val_accuracy: 0.9640 - val_auc_4: 0.8553 - val_loss: 0.1060 - learning_rate: 4.0418e-05
Epoch 5/5
[1m3990/3990[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8273s[0m 2s/

<keras.src.callbacks.history.History at 0x26f26baaab0>

In [None]:
# Ask Keras for a {metric name: value} mapping. Zipping `metrics_names` with the
# returned list printed only "loss" and "compile_metrics" here, which mislabelled
# accuracy and dropped the AUC value entirely.
results = best_model.evaluate(val_dataset, return_dict=True)
for name, value in results.items():
    print(f"{name}: {value}")

[1m998/998[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1659s[0m 2s/step - accuracy: 0.9648 - auc_4: 0.8760 - loss: 0.1025
loss: 0.10189127922058105
compile_metrics: 0.9646661877632141
