The code below installs and imports the libraries required for this project.

In [1]:
!pip install pandas numpy scikit-learn transformers torch xgboost tqdm

import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

from transformers import BertTokenizer, BertModel
import torch
from torch.utils.data import DataLoader, Dataset
from collections import Counter
import random
from tqdm import tqdm



The code below sets the random seeds, loads the training and test data, and cleans the tweet text.

In [2]:
# Seed every random number generator used in this notebook for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Read the training and test tweet CSVs (train first, then test)
trainDf, testDf = (
    pd.read_csv(csvPath)
    for csvPath in ("train_E6oV3lV.csv", "test_tweets_anuFYb8.csv")
)

# Function for cleaning the data
def cleanTweet(tweet):
    """Normalize a raw tweet into lowercase alphanumeric text.

    Steps, in order: lowercase the text, drop the literal "@user" placeholder,
    drop "#" characters (the hashtag word itself is kept), remove "http..."
    links, remove every character that is not a-z, 0-9 or whitespace, then
    collapse whitespace runs to a single space and trim both ends.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (for example None, or the float
        NaN that pandas produces for an empty CSV cell) is treated as an
        empty tweet instead of raising.

    Returns
    -------
    str
        The cleaned tweet; "" when nothing survives cleaning.
    """
    # Missing cells are not strings, and .lower() would raise AttributeError
    # on them, aborting the whole .apply() over the column.
    if not isinstance(tweet, str):
        return ""

    tweet = tweet.lower()
    tweet = tweet.replace("@user", "")
    tweet = tweet.replace("#", "")
    # Links are removed before the character filter so no URL residue is left behind
    tweet = re.sub(r"http\S+", "", tweet)
    tweet = re.sub(r"[^a-z0-9\s]", "", tweet)
    tweet = re.sub(r"\s+", " ", tweet).strip()
    return tweet

# Add a cleaned-text column to both frames, one tweet at a time
trainDf["cleanTweet"] = trainDf["tweet"].map(cleanTweet)
testDf["cleanTweet"] = testDf["tweet"].map(cleanTweet)

# Preview raw vs. cleaned text for the first training rows
trainDf.loc[:, ["tweet", "cleanTweet"]].head()

Unnamed: 0,tweet,cleanTweet
0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i cant use cause they d...
2,bihday your majesty,bihday your majesty
3,#model i love u take with u all the time in ...,model i love u take with u all the time in ur
4,factsguide: society now #motivation,factsguide society now motivation


The code below featurizes the tweets with BERT: each cleaned tweet is encoded by `bert-base-uncased`, and its [CLS] embedding is used as the feature vector.

In [3]:
# Set seed for reproducibility
torch.manual_seed(42)

# Pick the fastest available backend: CUDA GPU first, then MPS (macOS),
# and fall back to CPU when neither accelerator is present
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Pretrained uncased BERT: the tokenizer plus the encoder, with the encoder
# moved to `device` and switched to eval mode
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased").to(device).eval()

# Targets are the training labels; inputs are every cleaned training tweet
labels = trainDf["label"].to_numpy()
tweets = trainDf["cleanTweet"].tolist()

# Custom dataset
class TweetDataset(Dataset):
    """Dataset that tokenizes one text per lookup.

    Each item is the tokenizer's output for a single text, requested with
    truncation and "max_length" padding at `maxLen`, with axis 0 squeezed
    out of every returned value.
    """

    def __init__(self, texts, tokenizer, maxLen = 64):
        # Only references are stored here; tokenization happens in __getitem__
        self.maxLen = maxLen
        self.tokenizer = tokenizer
        self.texts = texts

    def __len__(self):
        # One sample per stored text
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            max_length = self.maxLen,
            padding = "max_length",
            truncation = True,
            return_tensors = "pt",
        )

        # Drop axis 0 from every field so each sample is returned without it
        sample = {}
        for field, tensor in encoded.items():
            sample[field] = tensor.squeeze(0)
        return sample

# Wrap the tweets in a dataset and serve them in batches of 32
dataset = TweetDataset(tweets, tokenizer)
loader = DataLoader(dataset, batch_size = 32)

# One array of [CLS] vectors is appended here per batch
allEmbeddings = []

# Forward passes only: gradients are disabled while extracting features
with torch.no_grad():
    for batch in tqdm(loader, desc = "Generating BERT Embeddings"):
        # Move every tensor of the batch to the model's device
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        # Keep position 0 of the last hidden layer ([CLS]) and bring it back to NumPy on CPU
        clsEmbeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        allEmbeddings.append(clsEmbeddings)

# Join the per-batch arrays row-wise into the final feature matrix
Xbert = np.concatenate(allEmbeddings, axis = 0)
print(f"Shape of Final Embedding Matrix: {Xbert.shape}")


Generating BERT Embeddings: 100%|██████████| 999/999 [09:38<00:00,  1.73it/s]  

Shape of Final Embedding Matrix: (31962, 768)





The code below trains and evaluates one classifier from each of three model families on the BERT features: linear (Logistic Regression), ensemble (Random Forest), and boosting (XGBoost).

In [4]:
# Prepare target labels
ybert = labels

# Split into train and validation sets
XTrain, Xval, yTrain, yVal = train_test_split(Xbert, ybert, test_size = 0.2, random_state = 42)

def _fitAndReport(model, name, XFit, yFit, XEval, yEval):
    """Fit a classifier, print its evaluation, and return its predictions.

    Replaces the fit / predict / print sequence that was previously repeated
    for every model, so all models are evaluated in exactly the same way.

    Parameters
    ----------
    model : estimator exposing fit(X, y) and predict(X)
        Classifier to train; it is fitted in place.
    name : str
        Label used as the prefix of the printed accuracy line.
    XFit, yFit
        Features and labels used for fitting.
    XEval, yEval
        Features and labels used for evaluation.

    Returns
    -------
    The predictions of `model` on `XEval`.
    """
    model.fit(XFit, yFit)
    yPred = model.predict(XEval)
    print(f"{name} Accuracy: {accuracy_score(yEval, yPred)}")
    print(classification_report(yEval, yPred))
    return yPred

# Linear Model: Logistic Regression
logReg = LogisticRegression(max_iter = 1000, class_weight = "balanced")
yPredLogReg = _fitAndReport(logReg, "Logistic Regression", XTrain, yTrain, Xval, yVal)

# Ensemble Model: Random Forest
rf = RandomForestClassifier(n_estimators = 100, random_state = 42, class_weight = "balanced")
yPredRF = _fitAndReport(rf, "Random Forest", XTrain, yTrain, Xval, yVal)

# Weight the positive class by the negative-to-positive ratio of the training
# split, passed to XGBoost as scale_pos_weight to balance the classes
counter = Counter(yTrain)
scalePosWeight = counter[0] / counter[1]

# Boosting Model: XGBoost
xgb = XGBClassifier(eval_metric = "logloss", random_state = 42, scale_pos_weight = scalePosWeight)
yPredXGB = _fitAndReport(xgb, "XGBoost", XTrain, yTrain, Xval, yVal)

Logistic Regression Accuracy: 0.87533239480682
              precision    recall  f1-score   support

           0       0.98      0.88      0.93      5937
           1       0.34      0.81      0.48       456

    accuracy                           0.88      6393
   macro avg       0.66      0.84      0.70      6393
weighted avg       0.94      0.88      0.90      6393

Random Forest Accuracy: 0.9486938839355545
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      5937
           1       0.93      0.30      0.46       456

    accuracy                           0.95      6393
   macro avg       0.94      0.65      0.72      6393
weighted avg       0.95      0.95      0.94      6393

XGBoost Accuracy: 0.9540122008446739
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      5937
           1       0.75      0.53      0.62       456

    accuracy                           0.95      6393
 