The code below imports the necessary libraries for this project.

In [1]:
!pip install pandas numpy scikit-learn transformers torch xgboost tqdm joblib

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

from transformers import BertTokenizer, BertModel
import torch
from torch.utils.data import DataLoader, Dataset
from collections import Counter
import random
from tqdm import tqdm
import joblib



The code below loads the data and cleans it.

In [2]:
# Generate random seeds for reporductibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Load the data
trainDf = pd.read_csv("train_E6oV3lV.csv")
testDf = pd.read_csv("test_tweets_anuFYb8.csv")

# Function for cleaning the data
def cleanTweet(tweet):
    tweet = tweet.lower()
    tweet = tweet.replace("@user", "")
    tweet = tweet.replace("#", "")
    tweet = re.sub(r"http\S+", "", tweet)
    tweet = re.sub(r"[^a-z0-9\s]", "", tweet)
    tweet = re.sub(r"\s+", " ", tweet).strip()
    return tweet

# Clean the data
trainDf["cleanTweet"] = trainDf["tweet"].apply(cleanTweet)
testDf["cleanTweet"] = testDf["tweet"].apply(cleanTweet)

trainDf[["tweet", "cleanTweet"]].head()

Unnamed: 0,tweet,cleanTweet
0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i cant use cause they d...
2,bihday your majesty,bihday your majesty
3,#model i love u take with u all the time in ...,model i love u take with u all the time in ur
4,factsguide: society now #motivation,factsguide society now motivation


The code below implements the BERT form of featurization. We use BERT emebeddings since it understand contextual meaning, informal language and slang usage more efficiently than TF-IDF Vectorization. 

In [3]:
# Set seed for reproducibility
torch.manual_seed(42)

# Use MPS if available, fallback to CPU (Created for MacOS)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bertModel = BertModel.from_pretrained("bert-base-uncased").to(device)
bertModel.eval()

# Use all tweets and labels from trainDf
tweets = trainDf["cleanTweet"].tolist()
labels = trainDf["label"].values

# Custom dataset
class TweetDataset(Dataset):
    def __init__(self, texts, tokenizer, maxLen = 64):
        self.texts = texts
        self.tokenizer = tokenizer
        self.maxLen = maxLen

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation = True,
            padding = "max_length",
            max_length = self.maxLen,
            return_tensors = "pt"
        )
        return {key: val.squeeze(0) for key, val in encoding.items()}

# Instantiate dataset and dataloader
dataset = TweetDataset(tweets, tokenizer)
loader = DataLoader(dataset, batch_size = 32)

# Collect embeddings
allEmbeddings = []

# Generate [CLS] emebeddings batch-by-batch and store them
with torch.no_grad():
    for batch in tqdm(loader, desc = "Generating BERT Embeddings"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = bertModel(**batch)
        clsEmbeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        allEmbeddings.append(clsEmbeddings)

# Stack to final matrix
Xbert = np.vstack(allEmbeddings)
print(f"Shape of Final Embedding Matrix: {Xbert.shape}")


Generating BERT Embeddings: 100%|██████████| 999/999 [08:29<00:00,  1.96it/s]

Shape of Final Embedding Matrix: (31962, 768)





The code below implements the Boosting Model family, or XGBoost, since the results were more accurate, specifically 95.4%, compared to the rest.

In [4]:
# Prepare target labels
ybert = labels

# Split into train and validation sets
XTrain, Xval, yTrain, yVal = train_test_split(Xbert, ybert, test_size = 0.2, random_state = 42)

# Calculate scale position weight parameter to balanced class weights
# CHANGED FROM COUNTER TO SAMPLE WEIGHT
from sklearn.utils.class_weight import compute_sample_weight
sampleWeights = compute_sample_weight(class_weight = "balanced", y = yTrain)

# Boosting Model: XGBoost
xgb = XGBClassifier(
    n_estimators = 300,
    max_depth = 6,
    learning_rate = 0.05,
    subsample = 0.8,
    colsample_bytree = 0.8,
    eval_metric = "logloss",
    random_state = 42
)

xgb.fit(XTrain, yTrain,
        eval_set = [(Xval, yVal)],
        early_stopping_rounds = 10,
        verbose = True,
        sample_weight = sampleWeights
)

yPredXGB = xgb.predict(Xval)

# EXTRA
from sklearn.metrics import f1_score, confusion_matrix, classification_report
print(classification_report(yVal, yPredXGB, digits = 4))
print("Confusion Matrix:\n", confusion_matrix(yVal, yPredXGB))

print(f"XGBoost Accuracy: {accuracy_score(yVal, yPredXGB)}")
print(classification_report(yVal, yPredXGB))

TypeError: XGBClassifier.fit() got an unexpected keyword argument 'early_stopping_rounds'

The code below saves the model for future testing use.

In [None]:
# Save the model for future testing
joblib.dump(xgb, "finalModel.pkl")

['finalModel.pkl']

The code below loads the model and tests it on the testing file.

In [None]:
# Load the trained model
xgbModel = joblib.load("finalModel.pkl")

# Tokenize the test tweets
testTweets = testDf["cleanTweet"].tolist()
testDataset = TweetDataset(testTweets, tokenizer)
testLoader = DataLoader(testDataset, batch_size = 32)

# Generate BERT [CLS] embeddings for test data using modelBERT
testEmbeddings = []

with torch.no_grad():
    for batch in tqdm(testLoader, desc = "Generating Test Embeddings"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = bertModel(**batch)
        clsEmbeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        testEmbeddings.append(clsEmbeddings)

XTestFinal = np.vstack(testEmbeddings)

# Predict using the loaded classifier
yPredTest = xgbModel.predict(XTestFinal)

# Add predictions to the test dataframe
testDf["predictedLabel"] = yPredTest

# Export the results to a CSV file
testDf[["id", "predictedLabel"]].to_csv("testPredictions.csv", index = False)

Generating Test Embeddings: 100%|██████████| 538/538 [05:11<00:00,  1.72it/s]


The code below provides a function for the trained model to predict on user inputs.

In [None]:
def predict(tweet, bertModel, xgbModel):
    # Function for cleaning the data
    def cleanTweet(tweet):
        tweet = tweet.lower()
        tweet = tweet.replace("@user", "")
        tweet = tweet.replace("#", "")
        tweet = re.sub(r"http\S+", "", tweet)
        tweet = re.sub(r"[^a-z0-9\s]", "", tweet)
        tweet = re.sub(r"\s+", " ", tweet).strip()
        return tweet
    
    # Clean the tweet
    cleanedTweet = cleanTweet(tweet)

    # Tokenize and prepare input for BERT
    encoding = tokenizer(
        cleanedTweet,
        truncation = True,
        padding = "max_length",
        max_length = 64,
        return_tensors = "pt"
    )
    encoding = {k: v.to(device) for k, v in encoding.items()}

    # Generate BERT [CLS] embedding
    with torch.no_grad():
        outputs = bertModel(**encoding)
        clsEmbedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    
    # Use trained XGBoost model to predict
    prediction = xgbModel.predict(clsEmbedding)

    # Return predicted label
    return prediction[0]

def classifyTweet(tweet, bertModel, xgbModel):
    labelMap = {0: "Non-Hate Speech", 1: "Hate Speech"}
    print(f"Tweet: {tweet}\nPrediction: {labelMap[predict(tweet, bertModel, xgbModel)]}\n")


classifyTweet("I hate you and everything you stand for.", bertModel, xgbModel)
classifyTweet("I love spending time with my family.", bertModel, xgbModel)
classifyTweet("Go back to where you came from.", bertModel, xgbModel)
classifyTweet("What a beautiful day it is today!", bertModel, xgbModel)
classifyTweet("Kill all the terrorists.", bertModel, xgbModel)
classifyTweet("Happy birthday! Hope you have a great day.", bertModel, xgbModel)

Tweet: I hate you and everything you stand for.
Prediction: Non-Hate Speech

Tweet: I love spending time with my family.
Prediction: Non-Hate Speech

Tweet: Go back to where you came from.
Prediction: Non-Hate Speech

Tweet: What a beautiful day it is today!
Prediction: Non-Hate Speech

Tweet: Kill all the terrorists.
Prediction: Non-Hate Speech

Tweet: Happy birthday! Hope you have a great day.
Prediction: Non-Hate Speech

Training set label counts: Counter({1: 23783, 0: 23783})
Validation set label counts: Counter({0: 5937, 1: 456})
