In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install pyspark
!pip install findspark



In [3]:
import json
import pandas as pd
import os
from PIL import Image
import matplotlib.pyplot as plt
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Input, Embedding, LSTM, Dropout, GlobalAveragePooling2D, Concatenate
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split

In [4]:
# too much overhead - takes very long

# # Read the JSON file into a Spark DataFrame
# json_df = spark.read.json("/kaggle/input/multimodal-hate-speech/MMHS150K_GT.json")

# # Show the DataFrame (optional)
# json_df.show()

# # If you want to see the schema
# json_df.printSchema()

In [5]:
with open('/kaggle/input/multimodal-hate-speech/MMHS150K_GT.json', 'r') as f:
    annotations = json.load(f)

In [6]:
# Convert the JSON dict to a DataFrame
data = []
for tweet_id, info in annotations.items():
    data.append({
        'tweet_id': tweet_id,
        'tweet_text': info['tweet_text'],
        'labels': info['labels'],
        'labels_str': info['labels_str']
    })

df = pd.DataFrame(data)

In [7]:
df.head()

Unnamed: 0,tweet_id,tweet_text,labels,labels_str
0,1114679353714016256,@FriskDontMiss Nigga https://t.co/cAsaLWEpue,"[4, 1, 3]","[Religion, Racist, Homophobe]"
1,1063020048816660480,My horses are retarded https://t.co/HYhqc6d5WN,"[5, 5, 5]","[OtherHate, OtherHate, OtherHate]"
2,1108927368075374593,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,"[0, 0, 0]","[NotHate, NotHate, NotHate]"
3,1114558534635618305,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,"[1, 0, 0]","[Racist, NotHate, NotHate]"
4,1035252480215592966,“EVERYbody calling you Nigger now!” https://t....,"[1, 0, 1]","[Racist, NotHate, Racist]"


In [8]:
# Path to the image folder
image_folder = '/kaggle/input/multimodal-hate-speech/img_resized'
# Add image path column
df['image_path'] = df['tweet_id'].apply(lambda x: os.path.join(image_folder, f"{x}.jpg"))

In [9]:
# Define majority vote function for labels
from collections import Counter

def majority_vote(labels):
    label_count = Counter(labels)
    return label_count.most_common(1)[0][0]

In [10]:
# Apply majority vote to create a single label column
df['majority_label'] = df['labels'].apply(majority_vote)

label_mapping = {
    0: "NotHate",
    1: "Racist",
    2: "Sexist",
    3: "Homophobe",
    4: "Religion",
    5: "OtherHate"
}

In [11]:
# Create a new column 'majority_label_str' with the string representation of the majority label
df['majority_label_str'] = df['majority_label'].map(label_mapping)

In [12]:
df.head()

Unnamed: 0,tweet_id,tweet_text,labels,labels_str,image_path,majority_label,majority_label_str
0,1114679353714016256,@FriskDontMiss Nigga https://t.co/cAsaLWEpue,"[4, 1, 3]","[Religion, Racist, Homophobe]",/kaggle/input/multimodal-hate-speech/img_resiz...,4,Religion
1,1063020048816660480,My horses are retarded https://t.co/HYhqc6d5WN,"[5, 5, 5]","[OtherHate, OtherHate, OtherHate]",/kaggle/input/multimodal-hate-speech/img_resiz...,5,OtherHate
2,1108927368075374593,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,"[0, 0, 0]","[NotHate, NotHate, NotHate]",/kaggle/input/multimodal-hate-speech/img_resiz...,0,NotHate
3,1114558534635618305,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,"[1, 0, 0]","[Racist, NotHate, NotHate]",/kaggle/input/multimodal-hate-speech/img_resiz...,0,NotHate
4,1035252480215592966,“EVERYbody calling you Nigger now!” https://t....,"[1, 0, 1]","[Racist, NotHate, Racist]",/kaggle/input/multimodal-hate-speech/img_resiz...,1,Racist


In [13]:
df.shape

(149823, 7)

In [14]:
# Preprocess text data
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

# Apply text preprocessing
df['cleaned_text'] = df['tweet_text'].apply(preprocess_text)
df.head()

Unnamed: 0,tweet_id,tweet_text,labels,labels_str,image_path,majority_label,majority_label_str,cleaned_text
0,1114679353714016256,@FriskDontMiss Nigga https://t.co/cAsaLWEpue,"[4, 1, 3]","[Religion, Racist, Homophobe]",/kaggle/input/multimodal-hate-speech/img_resiz...,4,Religion,nigga
1,1063020048816660480,My horses are retarded https://t.co/HYhqc6d5WN,"[5, 5, 5]","[OtherHate, OtherHate, OtherHate]",/kaggle/input/multimodal-hate-speech/img_resiz...,5,OtherHate,my horses are retarded
2,1108927368075374593,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,"[0, 0, 0]","[NotHate, NotHate, NotHate]",/kaggle/input/multimodal-hate-speech/img_resiz...,0,NotHate,nigga on ma momma youngboy be spitting real sh...
3,1114558534635618305,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,"[1, 0, 0]","[Racist, NotHate, NotHate]",/kaggle/input/multimodal-hate-speech/img_resiz...,0,NotHate,rt xxsugvngxx i ran into this holy nigga today
4,1035252480215592966,“EVERYbody calling you Nigger now!” https://t....,"[1, 0, 1]","[Racist, NotHate, Racist]",/kaggle/input/multimodal-hate-speech/img_resiz...,1,Racist,everybody calling you nigger now


In [15]:
# Check the group sizes
df['majority_label'].value_counts()

majority_label
0    116790
1     14183
5      8196
2      5375
3      4926
4       353
Name: count, dtype: int64

In [16]:
df['binary_label'] = df['majority_label'].apply(lambda x: 1 if x in [2,3,4,5] else x)

In [17]:
df.head()

Unnamed: 0,tweet_id,tweet_text,labels,labels_str,image_path,majority_label,majority_label_str,cleaned_text,binary_label
0,1114679353714016256,@FriskDontMiss Nigga https://t.co/cAsaLWEpue,"[4, 1, 3]","[Religion, Racist, Homophobe]",/kaggle/input/multimodal-hate-speech/img_resiz...,4,Religion,nigga,1
1,1063020048816660480,My horses are retarded https://t.co/HYhqc6d5WN,"[5, 5, 5]","[OtherHate, OtherHate, OtherHate]",/kaggle/input/multimodal-hate-speech/img_resiz...,5,OtherHate,my horses are retarded,1
2,1108927368075374593,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,"[0, 0, 0]","[NotHate, NotHate, NotHate]",/kaggle/input/multimodal-hate-speech/img_resiz...,0,NotHate,nigga on ma momma youngboy be spitting real sh...,0
3,1114558534635618305,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,"[1, 0, 0]","[Racist, NotHate, NotHate]",/kaggle/input/multimodal-hate-speech/img_resiz...,0,NotHate,rt xxsugvngxx i ran into this holy nigga today,0
4,1035252480215592966,“EVERYbody calling you Nigger now!” https://t....,"[1, 0, 1]","[Racist, NotHate, Racist]",/kaggle/input/multimodal-hate-speech/img_resiz...,1,Racist,everybody calling you nigger now,1


In [18]:
df['binary_label_str'] = df['majority_label_str'].apply(lambda x: 'Hate' if x in ['Religion', 'OtherHate','Racist', 'Sexist','Homophobe'] else x)

In [19]:
df.head()

Unnamed: 0,tweet_id,tweet_text,labels,labels_str,image_path,majority_label,majority_label_str,cleaned_text,binary_label,binary_label_str
0,1114679353714016256,@FriskDontMiss Nigga https://t.co/cAsaLWEpue,"[4, 1, 3]","[Religion, Racist, Homophobe]",/kaggle/input/multimodal-hate-speech/img_resiz...,4,Religion,nigga,1,Hate
1,1063020048816660480,My horses are retarded https://t.co/HYhqc6d5WN,"[5, 5, 5]","[OtherHate, OtherHate, OtherHate]",/kaggle/input/multimodal-hate-speech/img_resiz...,5,OtherHate,my horses are retarded,1,Hate
2,1108927368075374593,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,"[0, 0, 0]","[NotHate, NotHate, NotHate]",/kaggle/input/multimodal-hate-speech/img_resiz...,0,NotHate,nigga on ma momma youngboy be spitting real sh...,0,NotHate
3,1114558534635618305,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,"[1, 0, 0]","[Racist, NotHate, NotHate]",/kaggle/input/multimodal-hate-speech/img_resiz...,0,NotHate,rt xxsugvngxx i ran into this holy nigga today,0,NotHate
4,1035252480215592966,“EVERYbody calling you Nigger now!” https://t....,"[1, 0, 1]","[Racist, NotHate, Racist]",/kaggle/input/multimodal-hate-speech/img_resiz...,1,Racist,everybody calling you nigger now,1,Hate


In [20]:
df.drop(['labels','labels_str','majority_label','majority_label_str','tweet_text'], axis=1, inplace=True)

In [21]:
df.head()

Unnamed: 0,tweet_id,image_path,cleaned_text,binary_label,binary_label_str
0,1114679353714016256,/kaggle/input/multimodal-hate-speech/img_resiz...,nigga,1,Hate
1,1063020048816660480,/kaggle/input/multimodal-hate-speech/img_resiz...,my horses are retarded,1,Hate
2,1108927368075374593,/kaggle/input/multimodal-hate-speech/img_resiz...,nigga on ma momma youngboy be spitting real sh...,0,NotHate
3,1114558534635618305,/kaggle/input/multimodal-hate-speech/img_resiz...,rt xxsugvngxx i ran into this holy nigga today,0,NotHate
4,1035252480215592966,/kaggle/input/multimodal-hate-speech/img_resiz...,everybody calling you nigger now,1,Hate


In [22]:
df['binary_label'].value_counts()

binary_label
0    116790
1     33033
Name: count, dtype: int64

In [23]:
from sklearn.model_selection import train_test_split

# Separate the 'Hate' and 'NotHate' classes
hate_df = df[df['binary_label_str'] == 'Hate']
not_hate_df = df[df['binary_label_str'] == 'NotHate']

# Sample 33,000 records from each class
hate_sampled = hate_df.sample(n=33000, random_state=42)
not_hate_sampled = not_hate_df.sample(n=33000, random_state=42)

# Combine the sampled DataFrames
balanced_df = pd.concat([hate_sampled, not_hate_sampled])

# Split the balanced DataFrame into training and testing sets
train_data, test_data = train_test_split(balanced_df, test_size=0.3, random_state=42, stratify=balanced_df['binary_label_str'])

# Verify the distribution
print("Train Data Label Distribution:\n", train_data['binary_label_str'].value_counts())
print("\nTest Data Label Distribution:\n", test_data['binary_label_str'].value_counts())

Train Data Label Distribution:
 binary_label_str
Hate       23100
NotHate    23100
Name: count, dtype: int64

Test Data Label Distribution:
 binary_label_str
Hate       9900
NotHate    9900
Name: count, dtype: int64


In [24]:
# Tokenize text
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_data['cleaned_text'])
X_train_text = pad_sequences(tokenizer.texts_to_sequences(train_data['cleaned_text']), maxlen=100)
X_test_text = pad_sequences(tokenizer.texts_to_sequences(test_data['cleaned_text']), maxlen=100)

In [25]:
# Load and preprocess - resize images
def load_and_preprocess_image(img_path, target_size=(224, 224)):
    try:
        if not os.path.exists(img_path):
            return np.zeros((target_size[0], target_size[1], 3))
        img = load_img(img_path, target_size=target_size)
        img = img_to_array(img) / 255.0
        return img
    except Exception as e:
        return np.zeros((target_size[0], target_size[1], 3))

In [26]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ImageLoading").getOrCreate()

In [27]:
train_spark_df = spark.createDataFrame(train_data)
test_spark_df = spark.createDataFrame(test_data)

In [28]:
train_spark_df.head()

Row(tweet_id='1055877405905043461', image_path='/kaggle/input/multimodal-hate-speech/img_resized/1055877405905043461.jpg', cleaned_text=' he pull his race card again ', binary_label=1, binary_label_str='Hate')

In [29]:
import cv2  # Or PIL (Pillow)
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, IntegerType

def load_image(image_path):
    try:
        image = cv2.imread(image_path) 
        if image is not None:
            return image.flatten().tolist() #Returns a flattened array.
        else:
            return None
    except Exception as e:
        print(f"Error loading image: {e}")
        return None

load_image_udf = udf(load_image, ArrayType(IntegerType())) #The return type will depend on the image and your needs.

train_spark_df = train_spark_df.withColumn("image_data", load_image_udf("image_path"))
test_spark_df = test_spark_df.withColumn("image_data", load_image_udf("image_path"))

In [30]:
train_spark_df.head()

Row(tweet_id='1055877405905043461', image_path='/kaggle/input/multimodal-hate-speech/img_resized/1055877405905043461.jpg', cleaned_text=' he pull his race card again ', binary_label=1, binary_label_str='Hate', image_data=[27, 244, 255, 22, 241, 255, 7, 229, 234, 9, 243, 239, 2, 254, 237, 0, 241, 217, 0, 243, 216, 0, 255, 229, 0, 252, 227, 0, 249, 229, 2, 247, 233, 10, 246, 236, 13, 247, 237, 8, 248, 237, 0, 249, 234, 0, 250, 233, 0, 247, 229, 0, 247, 229, 0, 248, 230, 0, 248, 230, 0, 247, 231, 0, 247, 232, 0, 247, 232, 0, 247, 232, 1, 245, 234, 2, 246, 235, 3, 247, 236, 3, 249, 237, 4, 249, 239, 3, 250, 240, 2, 252, 241, 2, 252, 241, 4, 249, 239, 3, 248, 238, 3, 248, 238, 2, 247, 237, 2, 246, 238, 2, 246, 238, 3, 247, 239, 3, 247, 239, 2, 246, 240, 2, 246, 240, 1, 245, 239, 1, 245, 239, 0, 244, 238, 0, 244, 238, 0, 243, 237, 0, 242, 237, 2, 243, 235, 4, 242, 235, 3, 243, 238, 4, 244, 239, 5, 244, 241, 6, 245, 242, 4, 245, 242, 5, 246, 243, 4, 245, 242, 4, 246, 241, 3, 247, 241, 2, 247,

so, pyspark MLlib does not have much support for deep learning models.

In [31]:
train_pandas_df = train_spark_df.toPandas()
test_pandas_df = test_spark_df.toPandas()

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 34166)
Traceback (most recent call last):
  File "/usr/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.10/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.10/socketserver.py", line 747, in __init__
    self.handle()
  File "/usr/local/lib/python3.10/dist-packages/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/usr/local/lib/python3.10/dist-packages/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
  File "/usr/local/lib/python3.10/dist-packages/pyspark/accumulators.py", line 271, in accum_updates
    num_updates = read_int(

ConnectionRefusedError: [Errno 111] Connection refused