In [2]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import re

In [2]:
# Run on the CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Load data
# Read the tweet spreadsheet into the `data` DataFrame using the openpyxl engine.
# NOTE(review): the path is an absolute, machine-specific Windows location, so
# this cell fails on any other machine — consider a configurable data directory.
file_path = "D:\\OneDrive - University of South Carolina\\Research\\CHQ_Twitter\\dataset\\0707_all_data\\SC_tweets.xlsx"
data = pd.read_excel(file_path, engine='openpyxl')

In [4]:
# Write the loaded tweets to a Feather file in the same directory as the
# source spreadsheet.
# NOTE(review): `output_file_path` is reassigned further down when the BERT
# features are saved, so its value depends on which cell ran last.
output_file_path = "D:\\OneDrive - University of South Carolina\\Research\\CHQ_Twitter\\dataset\\0707_all_data\\SC_tweets.feather"
data.to_feather(output_file_path)

In [4]:
# Preview the first six rows (all columns) of the loaded tweets.
data.iloc[:6]

Unnamed: 0,tweetid,userid,username,postdate,message,longitude,latitude
0,810979387847819264,555790283,BrownRtbrown,2016-12-19 22:45:27,@DanaPerino @PressSec @TheFive national treasu...,-80.926628,33.631138
1,810979516986392576,335937722,GinaaCocky_AF,2016-12-19 22:45:58,we'll jump that nigga,-81.63279,34.724121
2,810979872885665792,2763980290,longlivehotd,2016-12-19 22:47:23,Nigga just want to feel real love,-80.926628,33.631138
3,810980766268227584,120659589,LabelMeTwiix,2016-12-19 22:50:56,She made a nigga buss two times off the head m...,-79.780312,34.182663
4,765917393021440000,534912338,Nierrraaa,2016-08-17 14:25:11,yall niggas made us this way https://t.co/ZcO...,-80.926628,33.631138
5,765917458020589568,3384671872,MissTurnerFMS,2016-08-17 14:25:26,Who can build the longest chain out of one pie...,-80.926628,33.631138


In [5]:
def remove_emoji(string):
    """Strip a small, fixed set of emoji-related code points from ``string``.

    Only the six code points listed below are removed (any run of them is
    deleted); everything else is returned unchanged. The broad emoji ranges
    (e.g. U+1F600-U+1F64F) are not part of the character class, so
    pictographic emoji themselves are left in the text.

    Args:
        string: Text to clean.

    Returns:
        ``string`` with every occurrence of the listed code points removed.
    """
    stripped_code_points = (
        "\u200d"  # ZERO WIDTH JOINER
        "\u23cf"  # EJECT SYMBOL
        "\u23e9"  # BLACK RIGHT-POINTING DOUBLE TRIANGLE
        "\u231a"  # WATCH
        "\ufe0f"  # VARIATION SELECTOR-16
        "\u3030"  # WAVY DASH
    )
    return re.sub("[" + stripped_code_points + "]+", "", string, flags=re.UNICODE)

def clean_twts(tw):
    """Remove t.co links, @-mentions and stray emoji code points from a tweet.

    Args:
        tw: Raw tweet text (a ``str``).

    Returns:
        The tweet with t.co short links removed, @-mentions that are followed
        by a space or that end the text removed, and the result passed
        through ``remove_emoji``.
    """
    # Remove t.co short links. A raw string avoids the invalid "\/" escape
    # sequences, and the dot is escaped so it only matches a literal ".".
    tw = re.sub(r'https?://t\.co/[a-zA-Z0-9]+', "", tw)
    # Remove @-mentions. Besides a mention followed by a space, also drop a
    # mention that ends the tweet, which has no trailing space to match.
    tw = re.sub(r'@[a-zA-Z0-9_]+(?: |$)', "", tw)
    return remove_emoji(tw)

def clean_twt_row(row):
    """Return the cleaned tweet text for one row.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a ``'message'``
            entry holding the raw tweet text.

    Returns:
        The result of ``clean_twts`` applied to ``row['message']``.
    """
    message = row['message']
    return clean_twts(message)

In [6]:
# data cleaning
# Clean the tweet text column directly: a column-wise Series.apply calls
# clean_twts once per message without materialising a Series for every row
# (as a row-wise DataFrame.apply with axis=1 does), and it yields an empty
# Series rather than an empty DataFrame when `data` has no rows.
data['message_cleaned'] = data['message'].apply(clean_twts)
data.iloc[0:6, :]

Unnamed: 0,tweetid,userid,username,postdate,message,longitude,latitude,message_cleaned
0,810979387847819264,555790283,BrownRtbrown,2016-12-19 22:45:27,@DanaPerino @PressSec @TheFive national treasu...,-80.926628,33.631138,national treasure is the movie
1,810979516986392576,335937722,GinaaCocky_AF,2016-12-19 22:45:58,we'll jump that nigga,-81.63279,34.724121,we'll jump that nigga
2,810979872885665792,2763980290,longlivehotd,2016-12-19 22:47:23,Nigga just want to feel real love,-80.926628,33.631138,Nigga just want to feel real love
3,810980766268227584,120659589,LabelMeTwiix,2016-12-19 22:50:56,She made a nigga buss two times off the head m...,-79.780312,34.182663,She made a nigga buss two times off the head m...
4,765917393021440000,534912338,Nierrraaa,2016-08-17 14:25:11,yall niggas made us this way https://t.co/ZcO...,-80.926628,33.631138,yall niggas made us this way
5,765917458020589568,3384671872,MissTurnerFMS,2016-08-17 14:25:26,Who can build the longest chain out of one pie...,-80.926628,33.631138,Who can build the longest chain out of one pie...


In [7]:
# BERT setup
# Load the tokenizer and the encoder from the same pretrained checkpoint, then
# move the encoder onto the selected device.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [8]:
def tokenize_and_encode(text):
    """Encode ``text`` into BERT token ids with the module-level ``tokenizer``.

    Args:
        text: Text to tokenize.

    Returns:
        The PyTorch tensor produced by ``tokenizer.encode`` with
        ``return_tensors='pt'``; inputs longer than 512 tokens are truncated.
    """
    return tokenizer.encode(
        text,
        max_length=512,
        truncation=True,
        return_tensors='pt',
    )

def generate_features(text):
    """Embed ``text`` with BERT and mean-pool the final hidden states.

    Args:
        text: Text to embed.

    Returns:
        NumPy array obtained by averaging the model's ``last_hidden_state``
        over dimension 1, squeezing singleton dimensions and copying the
        result to the CPU.
    """
    token_ids = tokenize_and_encode(text).to(device)
    # Inference only: run the forward pass without gradient tracking.
    with torch.no_grad():
        hidden_states = model(token_ids).last_hidden_state
    # Collapse the per-token embeddings into one vector for the whole text.
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().cpu().numpy()

In [12]:
# Compute BERT features for every cleaned tweet, `chunk_size` rows at a time.
# Per-chunk feature frames are buffered in `temp_features` and merged into
# `all_features_df` whenever the chunk index is a multiple of 5.
# NOTE: chunks processed after the last such index are still sitting in
# `temp_features` when the loop ends and must be merged into
# `all_features_df` afterwards (the following cell does this).
chunk_size = 5000
all_features_df = pd.DataFrame()
temp_features = []

# Number of chunks of `chunk_size` rows needed to cover `data`
# (ceiling division: one extra chunk when there is a remainder).
num_chunks = len(data) // chunk_size + (1 if len(data) % chunk_size else 0)

print(num_chunks)

for chunk_idx in range(num_chunks):

    # Row range of this chunk; iloc slicing clips end_idx on the final chunk.
    start_idx = chunk_idx * chunk_size
    end_idx = start_idx + chunk_size
    
    chunk_data = data.iloc[start_idx:end_idx]
    
    # One feature vector per tweet, expanded into one column per dimension
    # and named BERT1, BERT2, ...
    features = pd.DataFrame(chunk_data['message_cleaned'].apply(generate_features).tolist())
    features.columns = ['BERT' + str(i + 1) for i in range(features.shape[1])]

    # Buffer the current chunk's features in the temp_features list
    temp_features.append(features)

    # Every fifth chunk: report progress, merge the buffered chunks into the
    # running result and empty the buffer.
    if chunk_idx % 5 == 0:
        print(chunk_idx)
        temp_features_df = pd.concat(temp_features, axis=0, ignore_index=True)
        all_features_df = pd.concat([all_features_df, temp_features_df], axis=0, ignore_index=True)
        # Disabled periodic checkpoint of the partial result:
        # all_features_df.to_excel(output_file_path, engine='openpyxl', index=False)
        temp_features = []

73
0
5
10
15
20
25
30
35
40
45
50
55
60
65
70


In [14]:
# Merge the chunks still buffered in temp_features (those processed after the
# loop's last periodic merge) into all_features_df.
# The guard avoids pd.concat raising ValueError on an empty buffer, and
# clearing the buffer afterwards keeps a second execution of this cell from
# appending the same rows again.
if temp_features:
    temp_features_df = pd.concat(temp_features, axis=0, ignore_index=True)
    all_features_df = pd.concat([all_features_df, temp_features_df], axis=0, ignore_index=True)
    temp_features = []

In [18]:
# Show the dimensions of the combined feature table as (rows, columns).
all_features_df.shape

(362627, 768)

In [19]:
# Preview the first five rows of the combined feature table.
all_features_df.iloc[:5]

Unnamed: 0,BERT1,BERT2,BERT3,BERT4,BERT5,BERT6,BERT7,BERT8,BERT9,BERT10,...,BERT759,BERT760,BERT761,BERT762,BERT763,BERT764,BERT765,BERT766,BERT767,BERT768
0,0.266411,-0.271815,-0.04684,0.106725,0.267258,-0.424704,0.172865,0.072938,-0.143964,-0.004774,...,0.122944,-0.144947,0.224371,-0.25405,-0.213241,0.142444,0.346553,0.161358,0.220109,0.044384
1,0.738933,0.419877,0.307914,0.11002,0.093325,-0.196112,0.423407,0.683323,-0.084137,0.004366,...,-0.1986,-0.241656,-0.201124,-0.098977,-0.060928,-0.241914,-0.060914,-0.450824,0.116016,-0.325667
2,0.415491,-0.067514,0.820043,0.097507,0.250365,-0.330574,0.843664,0.475462,-0.20337,-0.085514,...,0.234666,0.127815,0.316786,0.028316,-0.125629,0.204311,0.136583,-0.624107,-0.121403,-0.345228
3,0.181235,0.002691,0.360862,-0.154296,-0.192688,-0.140606,0.443567,0.865717,-0.200479,-0.001431,...,-0.240909,-0.129673,-0.065461,-0.064908,-0.304751,-0.271324,-0.198703,-0.506731,0.113161,-0.013145
4,0.458116,0.344774,0.167026,-0.168113,0.223548,-0.053159,0.655439,0.543069,-0.156358,-0.23711,...,-0.114369,-0.381655,-0.336731,-0.141827,-0.209108,-0.137198,0.023977,-0.359532,-0.070259,-0.100047


In [20]:
# Save to .csv file
# The features are written without the DataFrame index (index=False).
# NOTE(review): absolute, machine-specific path — this cell fails elsewhere.
output_file_path = "D:\\OneDrive - University of South Carolina\\Research\\CHQ_Twitter\\dataset\\0707_all_data\\BERT_features_SC_tweets.csv"
all_features_df.to_csv(output_file_path, index=False)

In [21]:
# Save the same feature table in Feather format alongside the CSV copy.
# NOTE(review): absolute, machine-specific path — this cell fails elsewhere.
output_file_path = "D:\\OneDrive - University of South Carolina\\Research\\CHQ_Twitter\\dataset\\0707_all_data\\BERT_features_SC_tweets.feather"
all_features_df.to_feather(output_file_path)

In [23]:
# Read the saved BERT feature table back from its Feather file into
# `df_from_feather`.
# NOTE(review): absolute, machine-specific path — this cell fails elsewhere.
input_file_path = "D:\\OneDrive - University of South Carolina\\Research\\CHQ_Twitter\\dataset\\0707_all_data\\BERT_features_SC_tweets.feather"
df_from_feather = pd.read_feather(input_file_path)