The code below imports the necessary libraries for this project.

In [1]:
!pip install pandas numpy scikit-learn transformers torch xgboost imblearn tqdm

import pandas as pd
import numpy as np
import re
import random
import os
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

from transformers import BertTokenizer, BertModel

from imblearn.over_sampling import SMOTE



The code below loads the data and cleans it.

In [2]:
# Generate random seeds for reporductibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Load the data
trainDf = pd.read_csv("train_E6oV3lV.csv")
testDf = pd.read_csv("test_tweets_anuFYb8.csv")

# Function for cleaning the data
def cleanTweet(tweet):
    tweet = tweet.lower()
    tweet = tweet.replace("@user", "")
    tweet = tweet.replace("#", "")
    tweet = re.sub(r"http\S+", "", tweet)
    tweet = re.sub(r"[^a-z0-9\s]", "", tweet)
    tweet = re.sub(r"\s+", " ", tweet).strip()
    return tweet

# Clean the data
trainDf["cleanTweet"] = trainDf["tweet"].apply(cleanTweet)
testDf["cleanTweet"] = testDf["tweet"].apply(cleanTweet)

The code below implements the BERT form of featurization.

In [3]:
# Use MPS for faster embeddings generated
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased").to(device)
model.eval()

# Dataset class for BERT embeddings
class TweetDataset(Dataset):
    def __init__(self, texts, tokenizer, maxLen = 64):
        self.texts = texts
        self.tokenizer = tokenizer
        self.maxLen = maxLen

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation = True,
            padding = "max_length",
            max_length = self.maxLen,
            return_tensors = "pt"
        )
        return {key: val.squeeze(0) for key, val in encoding.items()}

def generateEmbeddings(tweets, tokenizer, model, device, path = "BERTEmbeddings.pt"):
    print("Generating embeddings...")

    # Instantiate dataset and dataloader
    dataset = TweetDataset(tweets, tokenizer)
    loader = DataLoader(dataset, batch_size = 16)
                        
    # Collect embeddings
    allEmbeddings = []

    # Generate [CLS] emebeddings batch-by-batch and store them
    with torch.no_grad():
        for batch in tqdm(loader, desc = "Generating BERT Embeddings"):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            clsEmbeddings = outputs.last_hidden_state[:, 0, :].cpu()
            allEmbeddings.append(clsEmbeddings)

    # Concatenate to final matrix
    Xbert = torch.cat(allEmbeddings)
    print(f"Shape of Final Embedding Matrix: {Xbert.shape}")

    # Save to cache
    torch.save(Xbert, path)
    print(f"Embeddings saved to {path}")

    return Xbert.cpu().numpy()

# Generate embeddings for train set
tweets = trainDf["cleanTweet"].tolist()
Xbert = generateEmbeddings(tweets, tokenizer, model, device)

Generating embeddings...


Generating BERT Embeddings: 100%|██████████| 1998/1998 [08:34<00:00,  3.88it/s]


Shape of Final Embedding Matrix: torch.Size([31962, 768])
Embeddings saved to BERTEmbeddings.pt


The code below implements the Linear Model family, Ensemble Model family, and Boosting Model family.

In [4]:
# Prepare target labels
ybert = trainDf["label"].values

# Split train/validation
XTrain, XVal, yTrain, yVal = train_test_split(Xbert, ybert, test_size = 0.2, random_state = 42)

# Balance training set skewed towards non-hate speech labels
smote = SMOTE(random_state = 42, k_neighbors = 3)
XTrainSMOTE, yTrainSMOTE = smote.fit_resample(XTrain, yTrain)

# Linear Model: Logistic Regression
print("\n1. Linear Model Family - Logistic Regression\n")
logReg = LogisticRegression(
    max_iter = 5000,
    solver = "liblinear",
    C = 1.0,
    random_state = 42
)

with tqdm(total = 1, desc = "Logisitic Regression Training") as pbar:
    logReg.fit(XTrainSMOTE, yTrainSMOTE)
    pbar.update(1)

with tqdm(total = 1, desc = "Logistic Regression Prediction)") as pbar:
    yPredLogReg = logReg.predict(XVal)
    pbar.update(1)
    
print(f"Logistic Regression Accuracy: {accuracy_score(yVal, yPredLogReg):.4f}")
print(classification_report(yVal, yPredLogReg, target_names = ["non-hate", "hate"]))

# Ensemble Model: Random Forest
print("\n2. Ensemble Model Family - Random Forest\n")
rf = RandomForestClassifier(
    n_estimators = 50,
    max_depth = 10,
    min_samples_split = 5,
    min_samples_leaf = 2,
    n_jobs = -1,
    random_state = 42
)

with tqdm(total = 1, desc = "Random Forest Training") as pbar:
    rf.fit(XTrainSMOTE, yTrainSMOTE)
    pbar.update(1)

with tqdm(total = 1, desc = "Random Forest Prediction") as pbar:
    yPredRF = rf.predict(XVal)
    pbar.update(1)

print(f"Random Forest Accuracy: {accuracy_score(yVal, yPredRF):.4f}")
print(classification_report(yVal, yPredRF, target_names = ["non-hate", "hate"]))

# Boosting Model: XGBoost
print("\n3. Ensemble Model Family - XGBoost\n")
xgb = XGBClassifier(
    n_estimators = 50,
    max_depth = 4,
    learning_rate = 0.2,
    subsample = 0.8,
    colsample_bytree = 0.8,
    eval_metric = "logloss",
    random_state = 42,
    n_jobs = -1,
    verbosity = 0
)

with tqdm(total = 1, desc = "XGBoost Training") as pbar:
    xgb.fit(XTrainSMOTE, yTrainSMOTE)
    pbar.update(1)

with tqdm(total = 1, desc = "XGBoost Prediction") as pbar:
    yPredXGB = xgb.predict(XVal)
    pbar.update(1)

print(f"XGBoost Accuracy: {accuracy_score(yVal, yPredXGB):.4f}")
print(classification_report(yVal, yPredXGB, target_names = ["non-hate", "hate"]))


1. Linear Model Family - Logistic Regression



Logisitic Regression Training: 100%|██████████| 1/1 [00:33<00:00, 33.45s/it]
Logistic Regression Prediction): 100%|██████████| 1/1 [00:00<00:00, 10.82it/s]


Logistic Regression Accuracy: 0.8932
              precision    recall  f1-score   support

    non-hate       0.98      0.90      0.94      5937
        hate       0.38      0.77      0.51       456

    accuracy                           0.89      6393
   macro avg       0.68      0.84      0.72      6393
weighted avg       0.94      0.89      0.91      6393


2. Ensemble Model Family - Random Forest



Random Forest Training: 100%|██████████| 1/1 [00:16<00:00, 16.28s/it]
Random Forest Prediction: 100%|██████████| 1/1 [00:00<00:00, 24.59it/s]


Random Forest Accuracy: 0.9205
              precision    recall  f1-score   support

    non-hate       0.97      0.94      0.96      5937
        hate       0.46      0.68      0.55       456

    accuracy                           0.92      6393
   macro avg       0.72      0.81      0.75      6393
weighted avg       0.94      0.92      0.93      6393


3. Ensemble Model Family - XGBoost



XGBoost Training: 100%|██████████| 1/1 [00:06<00:00,  6.46s/it]
XGBoost Prediction: 100%|██████████| 1/1 [00:00<00:00, 55.38it/s]

XGBoost Accuracy: 0.9019
              precision    recall  f1-score   support

    non-hate       0.98      0.92      0.95      5937
        hate       0.40      0.73      0.52       456

    accuracy                           0.90      6393
   macro avg       0.69      0.82      0.73      6393
weighted avg       0.94      0.90      0.91      6393






The code below implements the MLP Deep Learning Model.

In [5]:
# Change device to CPU since MPS freezes on last few batches
device = torch.device("cpu")

# Create MLP Deep Learning Model
class MLP(nn.Module):
    def __init__(self, inputDim = 768, hiddenDim = 256, numClasses = 2, dropoutRate = 0.3):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(inputDim, hiddenDim)
        self.dropout = nn.Dropout(dropoutRate)
        self.fc2 = nn.Linear(hiddenDim, numClasses)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

# Train the MLP Model
def trainMLP(XTrainSMOTE, yTrainSMOTE, XVal, yVal):
    # Set device to CPU, as MPS freezes at last couple of batches
    device = torch.device("cpu")

    # Convert to tensors
    XTrainTensor = torch.from_numpy(XTrainSMOTE.astype(np.float32))
    yTrainTensor = torch.from_numpy(yTrainSMOTE.astype(np.int64))
    XValTensor = torch.from_numpy(XVal.astype(np.float32))
    yValTensor = torch.from_numpy(yVal.astype(np.int64))

    # Create data loaders
    trainDataset = TensorDataset(XTrainTensor, yTrainTensor)
    trainLoader = DataLoader(trainDataset, batch_size = 32, shuffle = True, num_workers = 0, pin_memory = False)

    # Initialize MLP Deep Learning Model
    model = MLP().to(device)

    # Use weighted loss to handle class imbalance
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr = 0.001)

    # Training parameters
    epochs = 10
    bestHateF1 = 0.0
    bestModelState = None

    print("Training MLP Deep Learning Model...")

    # Training Phase
    for epoch in range(epochs):
        model.train()
        totalLoss = 0.0

        for batchX, batchY in tqdm(trainLoader, desc = f"Epoch {epoch + 1}/{epochs}"):
            batchX, batchY = batchX.to(device), batchY.to(device)

            optimizer.zero_grad()
            outputs = model(batchX)
            loss = criterion(outputs, batchY)
            loss.backward()
            optimizer.step()

            totalLoss += loss.item()
    
        # Validation Phase
        model.eval()
        with torch.no_grad():
            valOutputs = model(XValTensor.to(device))
            _, valPreds = torch.max(valOutputs, 1)
            valPredsNP = valPreds.cpu().numpy()

        # Calculate F1 Scores
        F1Macro = f1_score(yVal, valPredsNP, average = "macro")
        F1Hate = f1_score(yVal, valPredsNP, pos_label = 1)
        F1NonHate = f1_score(yVal, valPredsNP, pos_label = 0)

        accuracy = (valPredsNP == yVal).mean()

        print(f"Epoch {epoch + 1}: Loss = {totalLoss/len(trainLoader):.3f}, "
            f"Accuracy = {accuracy:.4f}, F1Hate = {F1Hate:.4f}, F1NonHate = {F1NonHate:.4f}, "
            f"F1Macro = {F1Macro:.4f}")

        # Track best model based on hate F1 score
        if F1Hate > bestHateF1:
            bestHateF1 = F1Hate
            bestHateF1Epoch = epoch
            bestModelState = model.state_dict().copy()

    # Load best model for final evaluation
    if bestModelState is not None:
        model.load_state_dict(bestModelState)
        print(f"\nLoaded Best Model with Hate F1 Score: {bestHateF1:.4f} at Epoch {bestHateF1Epoch + 1}")

    model.eval()

    with torch.no_grad():
        finalOutputs = model(XValTensor.to(device))
        _, finalPreds = torch.max(finalOutputs, 1)
        finalPredsNP = finalPreds.cpu().numpy()

    # Print results
    print(f"\nFinal Results after {epochs} Epochs\n")

    finalAccuracy = (finalPredsNP == yVal).mean()
    print(f"Final Accuracy: {finalAccuracy:.4f}")

    print("Final Classification Report:")
    print(classification_report(yVal, finalPredsNP, target_names = ["Non-Hate", "Hate"]))

    print("Final Confusion Matrix:")
    print(confusion_matrix(yVal, finalPredsNP))

    F1HateFinal = f1_score(yVal, finalPredsNP, pos_label = 1)
    F1NonHateFinal = f1_score(yVal, finalPredsNP, pos_label = 0)
    F1MacroFinal = f1_score(yVal, finalPredsNP, average = "macro")

    print(f"\nFinal F1 Scores:")
    print(f"    Hate Speech F1: {F1HateFinal:.4f}")
    print(f"    Non-Hate Speech F1: {F1NonHateFinal:.4f}")
    print(f"    Macro F1: {F1MacroFinal:.4f}")

    # Save the best model
    modelSavePath = f"bestMLPModel.pth"
    torch.save({
        "model_state_dict": model.state_dict(),
        "model_architecture": {
            "inputDim": 768,
            "hiddenDim": 256,
            "numClasses": 2,
            "dropoutRate": 0.3
        },
        "best_hate_f1": bestHateF1,
        "final_metrics": {
            "accuracy": finalAccuracy,
            "hate_f1": F1HateFinal,
            "non_hate_f1": F1NonHateFinal,
            "macro_f1": F1MacroFinal
        }
    }, modelSavePath)

    print(f"\nBest Model saved to: {modelSavePath}")
    print(f"To load this model later, use:")
    print(f"    checkpoint = torch.load(\"{modelSavePath}\")")
    print(f"    model = MLP(**checkpoint['model_architecture'])")
    print(f"    model.load_state_dict(checkpoint['model_state_dict'])")

    return model, finalPredsNP

print("MLP Deep Learning Model - SMOTE Balanced\n")
model, preds = trainMLP(XTrainSMOTE, yTrainSMOTE, XVal, yVal)

MLP Deep Learning Model - SMOTE Balanced

Training MLP Deep Learning Model...


Epoch 1/10: 100%|██████████| 1487/1487 [00:04<00:00, 355.10it/s]


Epoch 1: Loss = 0.275, Accuracy = 0.9024, F1Hate = 0.5378, F1NonHate = 0.9454, F1Macro = 0.7416


Epoch 2/10: 100%|██████████| 1487/1487 [00:04<00:00, 358.12it/s]


Epoch 2: Loss = 0.197, Accuracy = 0.8889, F1Hate = 0.5130, F1NonHate = 0.9373, F1Macro = 0.7252


Epoch 3/10: 100%|██████████| 1487/1487 [00:04<00:00, 341.77it/s]


Epoch 3: Loss = 0.146, Accuracy = 0.9199, F1Hate = 0.5733, F1NonHate = 0.9558, F1Macro = 0.7646


Epoch 4/10: 100%|██████████| 1487/1487 [00:04<00:00, 359.94it/s]


Epoch 4: Loss = 0.114, Accuracy = 0.9341, F1Hate = 0.5987, F1NonHate = 0.9641, F1Macro = 0.7814


Epoch 5/10: 100%|██████████| 1487/1487 [00:04<00:00, 327.57it/s]


Epoch 5: Loss = 0.091, Accuracy = 0.9360, F1Hate = 0.6064, F1NonHate = 0.9652, F1Macro = 0.7858


Epoch 6/10: 100%|██████████| 1487/1487 [00:04<00:00, 366.01it/s]


Epoch 6: Loss = 0.072, Accuracy = 0.9366, F1Hate = 0.5962, F1NonHate = 0.9656, F1Macro = 0.7809


Epoch 7/10: 100%|██████████| 1487/1487 [00:04<00:00, 352.15it/s]


Epoch 7: Loss = 0.058, Accuracy = 0.9377, F1Hate = 0.6113, F1NonHate = 0.9662, F1Macro = 0.7887


Epoch 8/10: 100%|██████████| 1487/1487 [00:04<00:00, 361.03it/s]


Epoch 8: Loss = 0.049, Accuracy = 0.9432, F1Hate = 0.6033, F1NonHate = 0.9694, F1Macro = 0.7863


Epoch 9/10: 100%|██████████| 1487/1487 [00:04<00:00, 312.12it/s]


Epoch 9: Loss = 0.043, Accuracy = 0.9496, F1Hate = 0.6265, F1NonHate = 0.9730, F1Macro = 0.7997


Epoch 10/10: 100%|██████████| 1487/1487 [00:04<00:00, 345.19it/s]


Epoch 10: Loss = 0.039, Accuracy = 0.9376, F1Hate = 0.6014, F1NonHate = 0.9661, F1Macro = 0.7838

Loaded Best Model with Hate F1 Score: 0.6265 at Epoch 9

Final Results after 10 Epochs

Final Accuracy: 0.9376
Final Classification Report:
              precision    recall  f1-score   support

    Non-Hate       0.97      0.96      0.97      5937
        Hate       0.55      0.66      0.60       456

    accuracy                           0.94      6393
   macro avg       0.76      0.81      0.78      6393
weighted avg       0.94      0.94      0.94      6393

Final Confusion Matrix:
[[5693  244]
 [ 155  301]]

Final F1 Scores:
    Hate Speech F1: 0.6014
    Non-Hate Speech F1: 0.9661
    Macro F1: 0.7838

Best Model saved to: bestMLPModel.pth
To load this model later, use:
    checkpoint = torch.load("bestMLPModel.pth")
    model = MLP(**checkpoint['model_architecture'])
    model.load_state_dict(checkpoint['model_state_dict'])
