# Multimodal Hate Speech Classification and Cyberbullying Detection

This notebook implements a multimodal model combining image and text features for hate speech detection and cyberbullying classification.

## Data Preprocessing


### Load Annotations and Prepare Data

In [5]:
import json
import pandas as pd
import os
from PIL import Image
import matplotlib.pyplot as plt
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Input, Embedding, LSTM, Dropout, GlobalAveragePooling2D, Concatenate
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split


In [6]:
# Load the annotations
with open('./multimodal-hate-speech/MMHS150K_GT.json', 'r') as f:
    annotations = json.load(f)


In [7]:
# Convert the JSON dict to a DataFrame
data = []
for tweet_id, info in annotations.items():
    data.append({
        'tweet_id': tweet_id,
        'tweet_text': info['tweet_text'],
        'labels': info['labels'],
        'labels_str': info['labels_str']
    })

df = pd.DataFrame(data)

In [8]:
df.head()

Unnamed: 0,tweet_id,tweet_text,labels,labels_str
0,1114679353714016256,@FriskDontMiss Nigga https://t.co/cAsaLWEpue,"[4, 1, 3]","[Religion, Racist, Homophobe]"
1,1063020048816660480,My horses are retarded https://t.co/HYhqc6d5WN,"[5, 5, 5]","[OtherHate, OtherHate, OtherHate]"
2,1108927368075374593,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,"[0, 0, 0]","[NotHate, NotHate, NotHate]"
3,1114558534635618305,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,"[1, 0, 0]","[Racist, NotHate, NotHate]"
4,1035252480215592966,“EVERYbody calling you Nigger now!” https://t....,"[1, 0, 1]","[Racist, NotHate, Racist]"


# Add Image Path and Create Majority Label

In [9]:

# Path to the image folder
image_folder = './multimodal-hate-speech/img_resized'

In [10]:
# Add image path column
df['image_path'] = df['tweet_id'].apply(lambda x: os.path.join(image_folder, f"{x}.jpg"))


In [11]:
# Define majority vote function for labels
from collections import Counter

def majority_vote(labels):
    label_count = Counter(labels)
    return label_count.most_common(1)[0][0]

In [12]:
# Apply majority vote to create a single label column
df['majority_label'] = df['labels'].apply(majority_vote)

label_mapping = {
    0: "NotHate",
    1: "Racist",
    2: "Sexist",
    3: "Homophobe",
    4: "Religion",
    5: "OtherHate"
}

In [13]:
df.head()

Unnamed: 0,tweet_id,tweet_text,labels,labels_str,image_path,majority_label
0,1114679353714016256,@FriskDontMiss Nigga https://t.co/cAsaLWEpue,"[4, 1, 3]","[Religion, Racist, Homophobe]",./multimodal-hate-speech/img_resized/111467935...,4
1,1063020048816660480,My horses are retarded https://t.co/HYhqc6d5WN,"[5, 5, 5]","[OtherHate, OtherHate, OtherHate]",./multimodal-hate-speech/img_resized/106302004...,5
2,1108927368075374593,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,"[0, 0, 0]","[NotHate, NotHate, NotHate]",./multimodal-hate-speech/img_resized/110892736...,0
3,1114558534635618305,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,"[1, 0, 0]","[Racist, NotHate, NotHate]",./multimodal-hate-speech/img_resized/111455853...,0
4,1035252480215592966,“EVERYbody calling you Nigger now!” https://t....,"[1, 0, 1]","[Racist, NotHate, Racist]",./multimodal-hate-speech/img_resized/103525248...,1


In [14]:
# Create a new column 'majority_label_str' with the string representation of the majority label
df['majority_label_str'] = df['majority_label'].map(label_mapping)


In [15]:
df.head()

Unnamed: 0,tweet_id,tweet_text,labels,labels_str,image_path,majority_label,majority_label_str
0,1114679353714016256,@FriskDontMiss Nigga https://t.co/cAsaLWEpue,"[4, 1, 3]","[Religion, Racist, Homophobe]",./multimodal-hate-speech/img_resized/111467935...,4,Religion
1,1063020048816660480,My horses are retarded https://t.co/HYhqc6d5WN,"[5, 5, 5]","[OtherHate, OtherHate, OtherHate]",./multimodal-hate-speech/img_resized/106302004...,5,OtherHate
2,1108927368075374593,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,"[0, 0, 0]","[NotHate, NotHate, NotHate]",./multimodal-hate-speech/img_resized/110892736...,0,NotHate
3,1114558534635618305,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,"[1, 0, 0]","[Racist, NotHate, NotHate]",./multimodal-hate-speech/img_resized/111455853...,0,NotHate
4,1035252480215592966,“EVERYbody calling you Nigger now!” https://t....,"[1, 0, 1]","[Racist, NotHate, Racist]",./multimodal-hate-speech/img_resized/103525248...,1,Racist


# Text Preprocessing

In [16]:
# Preprocess text data
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [17]:
# Apply text preprocessing
df['cleaned_text'] = df['tweet_text'].apply(preprocess_text)


# Train-Test Splitting and Sampling

In [18]:
# Check the group sizes
df['majority_label'].value_counts()

majority_label
0    116790
1     14183
5      8196
2      5375
3      4926
4       353
Name: count, dtype: int64

In [19]:
# Get the minimum group size
min_group_size = df['majority_label'].value_counts().min()
min_group_size

np.int64(353)

In [21]:
sampled_df = df.sample(n=50000, random_state=42)

sampled_df['majority_label'].value_counts()

majority_label
0    38936
1     4750
5     2746
2     1812
3     1645
4      111
Name: count, dtype: int64

In [22]:
train_data, temp_data = train_test_split(sampled_df, test_size=0.4, random_state=42)

val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)


In [23]:
# Print the sizes of train and test sets
print("Training data size:", train_data.shape)
print("Validation data size:", val_data.shape)
print("Testing data size:", test_data.shape)

Training data size: (30000, 8)
Validation data size: (10000, 8)
Testing data size: (10000, 8)


In [24]:
train_data['majority_label'].value_counts()

majority_label
0    23377
1     2839
5     1629
2     1129
3      963
4       63
Name: count, dtype: int64

In [27]:
# Tokenize text
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_data['cleaned_text'])
X_train_text = pad_sequences(tokenizer.texts_to_sequences(train_data['cleaned_text']), maxlen=100)
X_val_text = pad_sequences(tokenizer.texts_to_sequences(val_data['cleaned_text']), maxlen=100)
X_test_text = pad_sequences(tokenizer.texts_to_sequences(test_data['cleaned_text']), maxlen=100)

In [28]:
# Load and preprocess images
def load_and_preprocess_image(img_path, target_size=(224, 224)):
    try:
        if not os.path.exists(img_path):
            return np.zeros((target_size[0], target_size[1], 3))
        img = load_img(img_path, target_size=target_size)
        img = img_to_array(img) / 255.0
        return img
    except Exception as e:
        return np.zeros((target_size[0], target_size[1], 3))

In [29]:
X_train_image = np.array([load_and_preprocess_image(path) for path in train_data['image_path']])


In [30]:
X_val_image = np.array([load_and_preprocess_image(path) for path in val_data['image_path']])


In [31]:
X_test_image = np.array([load_and_preprocess_image(path) for path in test_data['image_path']])

In [32]:
# Labels
y_train = np.array(train_data['majority_label'])
y_val = np.array(val_data['majority_label'])
y_test = np.array(test_data['majority_label'])

# Model Building: Multimodal Model

## Image Model

In [33]:
# Define image model
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
for layer in base_model.layers:
    layer.trainable = False

image_input = Input(shape=(224, 224, 3))
x_image = base_model(image_input, training=False)
x_image = GlobalAveragePooling2D()(x_image)
x_image = Dense(256, activation='relu')(x_image)
x_image = Dropout(0.5)(x_image)

## Text Model

In [34]:
# Define text model
text_input = Input(shape=(100,))
x_text = Embedding(input_dim=20000, output_dim=128, input_length=100)(text_input)
x_text = LSTM(128, return_sequences=False)(x_text)
x_text = Dense(128, activation='relu')(x_text)
x_text = Dropout(0.5)(x_text)



## Combined Multimodal Model

In [35]:
# Combine image and text features
combined = Concatenate()([x_image, x_text])
x_combined = Dense(128, activation='relu')(combined)
x_combined = Dropout(0.5)(x_combined)
output = Dense(len(label_mapping), activation='softmax')(x_combined)

In [36]:
# Build model
multimodal_model = Model(inputs=[image_input, text_input], outputs=output)
multimodal_model.compile(optimizer=Adam(learning_rate=1e-3), loss='sparse_categorical_crossentropy', metrics=['accuracy'])


In [37]:
# Print Model Summary
multimodal_model.summary()

# Training the Model

In [None]:
# Train the model
history = multimodal_model.fit(
    [X_train_image, X_train_text], y_train,
    validation_data=([X_val_image, X_val_text], y_val),
    epochs=10,
    batch_size=128,
    verbose=1
)

In [None]:
# Save the entire binary model
multimodal_model.save('multimodel_model_updated.h5') 

In [None]:
# Evaluate the model
loss, accuracy = multimodal_model.evaluate([X_test_image, X_test_text], y_test, verbose=1)
print(f"Test Accuracy: {accuracy * 100:.2f}%")