# Multimodal Hate Speech Classification and Cyberbullying Detection

This notebook implements a multimodal model combining image and text features for hate speech detection and cyberbullying classification.

## Data Preprocessing


### Load Annotations and Prepare Data

In [34]:
import json
import pandas as pd
import os
from PIL import Image
import matplotlib.pyplot as plt
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Input, Dropout, GlobalAveragePooling2D, Concatenate
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split


from transformers import BertTokenizer, TFBertModel
import torch

# Load the pretrained uncased BERT-base tokenizer and its TensorFlow encoder.
# Both objects are module-level globals that the text-embedding helpers defined
# further down in this notebook read directly.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')



Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [35]:
# Load the ground-truth annotations (a JSON object keyed by tweet id).
# The encoding is passed explicitly because the tweet text contains non-ASCII
# characters (e.g. curly quotes) and the platform default encoding is not
# guaranteed to be UTF-8.
with open('./multimodal-hate-speech/MMHS150K_GT.json', 'r', encoding='utf-8') as f:
    annotations = json.load(f)


In [36]:
# Flatten the annotation dict into one record per tweet, keeping the tweet id
# (the dict key) alongside the text and the two label lists.
data = [
    {
        'tweet_id': tweet_id,
        'tweet_text': info['tweet_text'],
        'labels': info['labels'],
        'labels_str': info['labels_str'],
    }
    for tweet_id, info in annotations.items()
]

df = pd.DataFrame(data)

In [37]:
# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,tweet_id,tweet_text,labels,labels_str
0,1114679353714016256,@FriskDontMiss Nigga https://t.co/cAsaLWEpue,"[4, 1, 3]","[Religion, Racist, Homophobe]"
1,1063020048816660480,My horses are retarded https://t.co/HYhqc6d5WN,"[5, 5, 5]","[OtherHate, OtherHate, OtherHate]"
2,1108927368075374593,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,"[0, 0, 0]","[NotHate, NotHate, NotHate]"
3,1114558534635618305,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,"[1, 0, 0]","[Racist, NotHate, NotHate]"
4,1035252480215592966,“EVERYbody calling you Nigger now!” https://t....,"[1, 0, 1]","[Racist, NotHate, Racist]"


### Add Image Path and Create Majority Label

In [38]:

# Path to the image folder; image files are looked up in it by tweet id
# (<tweet_id>.jpg) when the image_path column is built.
image_folder = './multimodal-hate-speech/img_resized'

In [39]:
# Build each tweet's image path as <image_folder>/<tweet_id>.jpg
df['image_path'] = [os.path.join(image_folder, f"{x}.jpg") for x in df['tweet_id']]


In [40]:
# Define majority vote function for labels
from collections import Counter

def majority_vote(labels):
    """Return the label that occurs most often in ``labels``.

    When several labels share the highest count, the one that appears first
    in ``labels`` wins. An empty ``labels`` raises ``IndexError``.
    """
    ranked = Counter(labels).most_common(1)
    top_label, _votes = ranked[0]
    return top_label

In [41]:
# Collapse each tweet's list of labels into a single label by majority vote
df['majority_label'] = df['labels'].map(majority_vote)

# Human-readable name for every integer class id (ids 0-5, in this order)
label_mapping = dict(enumerate([
    "NotHate",
    "Racist",
    "Sexist",
    "Homophobe",
    "Religion",
    "OtherHate",
]))

In [42]:
# Preview the frame again to confirm the new image_path and majority_label columns.
df.head()

Unnamed: 0,tweet_id,tweet_text,labels,labels_str,image_path,majority_label
0,1114679353714016256,@FriskDontMiss Nigga https://t.co/cAsaLWEpue,"[4, 1, 3]","[Religion, Racist, Homophobe]",./multimodal-hate-speech/img_resized/111467935...,4
1,1063020048816660480,My horses are retarded https://t.co/HYhqc6d5WN,"[5, 5, 5]","[OtherHate, OtherHate, OtherHate]",./multimodal-hate-speech/img_resized/106302004...,5
2,1108927368075374593,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,"[0, 0, 0]","[NotHate, NotHate, NotHate]",./multimodal-hate-speech/img_resized/110892736...,0
3,1114558534635618305,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,"[1, 0, 0]","[Racist, NotHate, NotHate]",./multimodal-hate-speech/img_resized/111455853...,0
4,1035252480215592966,“EVERYbody calling you Nigger now!” https://t....,"[1, 0, 1]","[Racist, NotHate, Racist]",./multimodal-hate-speech/img_resized/103525248...,1


In [43]:
# Create a new column 'majority_label_str' with the string representation of the
# majority label, looked up in label_mapping (an id missing from the mapping
# would become NaN rather than raise).
df['majority_label_str'] = df['majority_label'].map(label_mapping)


In [44]:
# Preview the frame to confirm majority_label_str matches majority_label.
df.head()

Unnamed: 0,tweet_id,tweet_text,labels,labels_str,image_path,majority_label,majority_label_str
0,1114679353714016256,@FriskDontMiss Nigga https://t.co/cAsaLWEpue,"[4, 1, 3]","[Religion, Racist, Homophobe]",./multimodal-hate-speech/img_resized/111467935...,4,Religion
1,1063020048816660480,My horses are retarded https://t.co/HYhqc6d5WN,"[5, 5, 5]","[OtherHate, OtherHate, OtherHate]",./multimodal-hate-speech/img_resized/106302004...,5,OtherHate
2,1108927368075374593,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,"[0, 0, 0]","[NotHate, NotHate, NotHate]",./multimodal-hate-speech/img_resized/110892736...,0,NotHate
3,1114558534635618305,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,"[1, 0, 0]","[Racist, NotHate, NotHate]",./multimodal-hate-speech/img_resized/111455853...,0,NotHate
4,1035252480215592966,“EVERYbody calling you Nigger now!” https://t....,"[1, 0, 1]","[Racist, NotHate, Racist]",./multimodal-hate-speech/img_resized/103525248...,1,Racist


### Text Preprocessing

In [45]:
def preprocess_text_bert(text):
    """Return the BERT [CLS] embedding(s) of ``text`` as a NumPy array.

    Uses the module-level ``tokenizer`` and ``bert_model``.

    NOTE: a later cell (In [53]) defines another function with this same name;
    once that cell has run, this definition is shadowed.
    """
    # Tokenise into TensorFlow tensors, padded and truncated to at most 128 tokens
    encoded = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors="tf")
    hidden_states = bert_model(encoded).last_hidden_state
    # Position 0 along the sequence axis holds the [CLS] token
    cls_embedding = hidden_states[:, 0, :]
    return cls_embedding.numpy()


In [46]:
# Preprocess text data
def preprocess_text(text):
    """Lower-case a tweet and strip URLs, @mentions, '#', digits and punctuation.

    Matches are deleted, not replaced by a space, and whitespace is not
    collapsed afterwards, so the result may contain leading or repeated spaces.
    """
    cleaned = text.lower()
    # Order matters: URLs and @mentions must go before the punctuation pass,
    # which would otherwise break them into plain words.
    removal_steps = (
        (r'http\S+|www\S+|https\S+', re.MULTILINE),  # links
        (r'\@\w+|\#', 0),                            # @mentions and the '#' sign
        (r'\d+', 0),                                 # digit runs
        (r'[^\w\s]', 0),                             # anything not word/space
    )
    for pattern, flags in removal_steps:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)
    return cleaned

In [53]:
def preprocess_text_bert(texts, batch_size=32):
    """Embed texts with BERT in batches and return their [CLS] vectors.

    Uses the module-level ``tokenizer`` and ``bert_model``. This definition
    replaces the single-argument ``preprocess_text_bert`` from an earlier cell.

    Args:
        texts: A single string, or any sequence/iterable of strings.
        batch_size: Number of texts sent through BERT per forward pass.

    Returns:
        A 2-D NumPy array with one row per input text, in input order. For
        empty input an array of shape ``(0, hidden_size)`` is returned.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if isinstance(texts, str):
        texts = [texts]
    else:
        # The tokenizer expects a plain list of strings; this also lets callers
        # pass a tuple, a pandas Series or a generator.
        texts = list(texts)

    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    total_batches = (len(texts) + batch_size - 1) // batch_size  # ceiling division

    print(f"Processing {len(texts)} texts in {total_batches} batches...")

    if not texts:
        # np.concatenate rejects an empty list, so short-circuit with an empty
        # result that still has the embedding width callers expect.
        print("Completed processing 0 texts")
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]

        # Tokenize the batch; padding=True pads to the longest text in the batch
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=64,
            return_tensors="tf"
        )

        # Get BERT embeddings for the batch
        outputs = bert_model(inputs)

        # Use [CLS] token embedding (first token)
        all_embeddings.append(outputs.last_hidden_state[:, 0, :].numpy())

        # Print progress every 10th batch
        if (i // batch_size) % 10 == 0:
            print(f"Processed {i} texts...")

    # Concatenate all batch embeddings
    final_embeddings = np.concatenate(all_embeddings, axis=0)
    print(f"Completed processing {len(final_embeddings)} texts")
    return final_embeddings


In [57]:
# Work on a random 50,000-row sample of df; the fixed random_state makes the
# same rows come back on every run.
sampled_df = df.sample(n=50000, random_state=42)


In [58]:
# Apply text preprocessing: run preprocess_text over every tweet and store the
# result in a new 'cleaned_text' column (the raw 'tweet_text' is left untouched).
sampled_df['cleaned_text'] = sampled_df['tweet_text'].apply(preprocess_text)


### Train-Test Splitting and Sampling

In [59]:
# Embed every cleaned tweet with BERT. Row i of `embeddings` belongs to the i-th
# row of sampled_df (in its current order); later cells rely on this alignment
# when they slice the embeddings per split.
embeddings = preprocess_text_bert(sampled_df['cleaned_text'].tolist(), batch_size=32)


Processing 50000 texts in 1563 batches...
Processed 0 texts...
Processed 320 texts...
Processed 640 texts...
Processed 960 texts...
Processed 1280 texts...
Processed 1600 texts...
Processed 1920 texts...
Processed 2240 texts...
Processed 2560 texts...
Processed 2880 texts...
Processed 3200 texts...
Processed 3520 texts...
Processed 3840 texts...
Processed 4160 texts...
Processed 4480 texts...
Processed 4800 texts...
Processed 5120 texts...
Processed 5440 texts...
Processed 5760 texts...
Processed 6080 texts...
Processed 6400 texts...
Processed 6720 texts...
Processed 7040 texts...
Processed 7360 texts...
Processed 7680 texts...
Processed 8000 texts...
Processed 8320 texts...
Processed 8640 texts...
Processed 8960 texts...
Processed 9280 texts...
Processed 9600 texts...
Processed 9920 texts...
Processed 10240 texts...
Processed 10560 texts...
Processed 10880 texts...
Processed 11200 texts...
Processed 11520 texts...
Processed 11840 texts...
Processed 12160 texts...
Processed 12480 texts

In [65]:
# Check the group sizes: number of sampled rows per majority label, which
# shows how imbalanced the classes are.
sampled_df['majority_label'].value_counts()

majority_label
0    38936
1     4750
5     2746
2     1812
3     1645
4      111
Name: count, dtype: int64

In [73]:

# Replace the index inherited from df with a fresh 0..n-1 index (the old index is
# kept as a column). A later cell uses the split frames' index values to pick rows
# of `embeddings`, which is only valid with this positional index.
# NOTE(review): not idempotent -- re-running this cell adds yet another index column.
sampled_df = sampled_df.reset_index()

In [75]:
# 60/20/20 train/validation/test split. Both splits are stratified on the
# majority label because the classes are heavily imbalanced (the rarest class has
# only ~100 of the 50,000 sampled rows); an unstratified split can leave a subset
# with almost no examples of it.
train_data, temp_data = train_test_split(
    sampled_df,
    test_size=0.4,
    random_state=42,
    stratify=sampled_df['majority_label'],
)

# Halve the 40% hold-out into equally sized validation and test sets
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
    stratify=temp_data['majority_label'],
)


In [76]:
# Print the (rows, columns) shape of the train, validation and test sets
print("Training data size:", train_data.shape)
print("Validation data size:", val_data.shape)
print("Testing data size:", test_data.shape)

Training data size: (30000, 9)
Validation data size: (10000, 9)
Testing data size: (10000, 9)


In [77]:
# Get the indices for each split. These are index *labels* of the split frames;
# they are valid row positions in `embeddings` only because sampled_df was given
# a fresh 0..n-1 index (reset_index) without changing its row order.
train_indices = train_data.index
val_indices = val_data.index
test_indices = test_data.index

# Use these indices to get the corresponding embeddings
X_train_text = embeddings[train_indices]
X_val_text = embeddings[val_indices]
X_test_text = embeddings[test_indices]


In [78]:

# Load and preprocess images for each split


In [79]:
# Load and preprocess images
def load_and_preprocess_image(img_path, target_size=(224, 224)):
    """Load an image, resize it and scale its pixel values by 1/255.

    Loading is best-effort: a missing or unreadable file does not raise;
    an all-zero placeholder of the same shape is returned instead.

    Args:
        img_path: Path of the image file.
        target_size: Size passed to ``load_img``; its two entries become the
            first two dimensions of the result.

    Returns:
        A float32 array of shape ``(target_size[0], target_size[1], 3)``.
    """
    # NOTE(review): the ResNet50 backbone used later is loaded with ImageNet
    # weights, for which Keras documents its own `preprocess_input`; confirm
    # that plain 1/255 scaling is intended.
    try:
        if os.path.exists(img_path):
            img = load_img(img_path, target_size=target_size)
            pixels = img_to_array(img).astype(np.float32, copy=False)
            return pixels / 255.0
    except Exception:
        # Deliberately swallowed: one corrupt file must not abort a bulk load.
        pass
    # The placeholder is float32 like real images; a float64 placeholder would
    # upcast (and double the memory of) the whole stacked image array.
    return np.zeros((target_size[0], target_size[1], 3), dtype=np.float32)

In [80]:
# Load every training image into one array, in train_data row order
X_train_image = np.array(list(map(load_and_preprocess_image, train_data['image_path'])))


In [81]:
# Load every validation image into one array, in val_data row order
X_val_image = np.array(list(map(load_and_preprocess_image, val_data['image_path'])))


In [82]:
# Load every test image into one array, in test_data row order
X_test_image = np.array(list(map(load_and_preprocess_image, test_data['image_path'])))


In [83]:

# Get labels for each split: the integer majority label per row, in the same row
# order as the image and text feature arrays built from the same split frame.
y_train = np.array(train_data['majority_label'])
y_val = np.array(val_data['majority_label'])
y_test = np.array(test_data['majority_label'])

In [84]:
# Sanity check: expect (number of training rows, 224, 224, 3)
X_train_image.shape

(30000, 224, 224, 3)

# Model Building: Multimodal Model

## Image Model

In [85]:
# Define image model: an ImageNet-pretrained ResNet50 backbone without its
# classification head, followed by a small trainable projection.
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
# Freeze every backbone layer so only the layers added below are trained
for layer in base_model.layers:
    layer.trainable = False

image_input = Input(shape=(224, 224, 3))
# training=False runs the backbone in inference mode
x_image = base_model(image_input, training=False)
x_image = GlobalAveragePooling2D()(x_image)  # pool the spatial feature map to one vector
x_image = Dense(256, activation='relu')(x_image)  # project to 256 features
x_image = Dropout(0.5)(x_image)

## Text Model

In [86]:
# Define text model: the input is a precomputed 768-d text embedding (BERT itself
# is not part of this graph), projected to 256 features.
text_input = Input(shape=(768,))  # BERT base output dimension
x_text = Dense(256, activation='relu')(text_input)
x_text = Dropout(0.5)(x_text)


## Combined Multimodal Model

In [87]:
# Combine image and text features: concatenate the two 256-d branches, pass them
# through a shared hidden layer, and emit one softmax probability per class.
combined = Concatenate()([x_image, x_text])
x_combined = Dense(128, activation='relu')(combined)
x_combined = Dropout(0.5)(x_combined)
# One output unit per entry of label_mapping
output = Dense(len(label_mapping), activation='softmax')(x_combined)

In [88]:
# Build model: two inputs (image tensor, text embedding) -> class probabilities.
# sparse_categorical_crossentropy is used because the targets are integer class
# ids rather than one-hot vectors.
multimodal_model = Model(inputs=[image_input, text_input], outputs=output)
multimodal_model.compile(optimizer=Adam(learning_rate=1e-3), loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [89]:
# Print Model Summary (layers, output shapes and parameter counts)
multimodal_model.summary()

# Training the Model

In [None]:
# Train the model. Validation uses the dedicated validation split; the test split
# is kept out of fit() so that the final evaluation on it remains unbiased.
history = multimodal_model.fit(
    [X_train_image, X_train_text], y_train,
    validation_data=([X_val_image, X_val_text], y_val),
    epochs=10,
    batch_size=128,
    verbose=1
)

Epoch 1/10
[1m 46/235[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m8:57[0m 3s/step - accuracy: 0.6823 - loss: 1.2185

In [None]:
# Save the entire trained multi-class model to a single HDF5 (.h5) file
multimodal_model.save('multimodel_model_updated.h5') 

In [None]:
# Evaluate the model on the test split; evaluate() returns the loss followed by
# the compiled metrics (here: accuracy).
loss, accuracy = multimodal_model.evaluate([X_test_image, X_test_text], y_test, verbose=1)
print(f"Test Accuracy: {accuracy * 100:.2f}%")