The code below imports the necessary libraries for this project.

In [1]:
!pip install pandas numpy scikit-learn transformers torch

import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import BertTokenizer, BertModel
import torch



The code below loads the data and cleans it.

In [2]:
# Read the raw training and test tweet CSVs into DataFrames
# (training file first, then the test file)
trainDf, testDf = (
    pd.read_csv(csvPath)
    for csvPath in ("train_E6oV3lV.csv", "test_tweets_anuFYb8.csv")
)

# Function for cleaning the data
def cleanTweet(tweet):
    """Normalize a raw tweet into lowercase, space-separated alphanumeric text.

    The steps run in this order:
      1. lowercase the text,
      2. delete every "@user" placeholder,
      3. delete "#" characters (the hashtag word itself is kept),
      4. delete each "http..." run up to the next whitespace,
      5. delete every character that is not a-z, 0-9 or whitespace,
      6. collapse whitespace runs to a single space and trim both ends.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (for example None, or the
        float NaN that pandas uses for a missing cell) is treated as
        empty text.

    Returns
    -------
    str
        The cleaned tweet; "" when nothing survives cleaning or when the
        input was not a string.
    """
    # A missing cell reaches .apply() as NaN/None; calling .lower() on it
    # would raise AttributeError, so map such values to an empty string.
    if not isinstance(tweet, str):
        return ""

    tweet = tweet.lower()
    tweet = tweet.replace("@user", "")
    tweet = tweet.replace("#", "")
    # URLs must go before the punctuation filter below, otherwise the
    # filter would strip ":" and "/" and leave the URL letters behind.
    tweet = re.sub(r"http\S+", "", tweet)
    tweet = re.sub(r"[^a-z0-9\s]", "", tweet)
    # The deletions above can leave doubled or leading/trailing spaces.
    tweet = re.sub(r"\s+", " ", tweet).strip()
    return tweet

# Store the normalized text of every tweet in a new "cleanTweet" column,
# for the training frame and the test frame alike
trainDf["cleanTweet"] = trainDf["tweet"].map(cleanTweet)
testDf["cleanTweet"] = testDf["tweet"].map(cleanTweet)

# Preview raw vs. cleaned text for the first five training rows
trainDf.loc[:, ["tweet", "cleanTweet"]].head(5)

Unnamed: 0,tweet,cleanTweet
0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i cant use cause they d...
2,bihday your majesty,bihday your majesty
3,#model i love u take with u all the time in ...,model i love u take with u all the time in ur
4,factsguide: society now #motivation,factsguide society now motivation


The code below featurizes the cleaned tweets using TF-IDF vectorization.

In [3]:
# Build a TF-IDF vectorizer that skips common English stop words and
# keeps at most 5000 features
tfidf = TfidfVectorizer(
    max_features = 5000,
    stop_words = "english",
)

# Fit the vectorizer on the cleaned training tweets and, in the same
# call, transform those tweets into TF-IDF feature vectors
Xtfidf = tfidf.fit_transform(trainDf["cleanTweet"])

# Report the matrix dimensions as (# of tweets, # of features),
# e.g. (31962, 5000)
print(Xtfidf.shape)

(31962, 5000)


The code below featurizes the cleaned tweets using BERT [CLS] embeddings.

In [None]:
# Choose the compute device: the GPU ("cuda") when torch reports one is
# available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pre-trained "bert-base-uncased" tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# Move the model to the chosen device for faster computation and put it
# in evaluation mode, since it is only used for inference here
model.to(device)
model.eval()

# Function that returns the BERT [CLS] embedding of a text as a NumPy array
def getBertEmbedding(text):
    """Encode `text` with the module-level BERT model and return its [CLS] vector.

    The text is tokenized with the module-level `tokenizer`, truncated or
    padded to exactly 64 tokens, moved to the module-level `device`, and
    passed through `model` with gradient tracking disabled.

    Parameters
    ----------
    text : str
        Text to embed (the notebook passes cleaned tweets).

    Returns
    -------
    numpy.ndarray
        The last-layer hidden state at token position 0, with size-1 axes
        squeezed away. Presumably a 1-D vector of length 768 for a single
        string, judging by the (320, 768) shape printed for the sample.
    """
    encoded = tokenizer(
        text,
        return_tensors = "pt",
        truncation = True,
        padding = "max_length",
        max_length = 64,
    )

    # Every input tensor has to live on the same device as the model
    onDevice = {}
    for fieldName, fieldTensor in encoded.items():
        onDevice[fieldName] = fieldTensor.to(device)

    # Inference only, so skip building the autograd graph
    with torch.no_grad():
        hiddenStates = model(**onDevice).last_hidden_state

    # Token position 0 is the [CLS] slot; squeeze() drops the batch axis
    clsVector = hiddenStates[:, 0, :].squeeze()
    return clsVector.cpu().numpy()

# Embedding every tweet takes a long time, so work on a random 1% sample
# of the training set; the fixed seed (42) keeps the sample reproducible
subset = trainDf.sample(random_state = 42, frac = 0.01)

# Compute one BERT [CLS] embedding per cleaned tweet in the sample
bertFeatures = subset["cleanTweet"].apply(getBertEmbedding)

# Stack the per-tweet vectors row by row into a single 2-D matrix
Xbert = np.vstack(bertFeatures.tolist())

# Report the matrix dimensions as (# of tweets, embedding size),
# e.g. (31962, 768) for the full set or (320, 768) for the 1% sample
print(Xbert.shape)

(320, 768)
