In [1]:
import os
import pandas as pd
import numpy as np
import transformers
import torch
import matplotlib.pyplot as plt
import time
import datetime

from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split

from utilize import gen_dataframe

[nltk_data] Downloading package omw-1.4 to /home/xdy/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /home/xdy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/xdy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
if torch.cuda.is_available():    
    device = torch.device("cuda")
    print('GPU:', torch.cuda.device_count(), torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")

device

GPU: 1 NVIDIA GeForce RTX 2080


device(type='cuda')

In [3]:
df = gen_dataframe('./data/')
df.head()

Unnamed: 0,date,tweet,cleaned_tweet
0,2022-03-07 16:21:36,@MinMinHugs @flat__stanley @mimi_soapbox @mrma...,aunt heart attack survived sad dont course thi...
1,2022-03-07 15:47:58,@CBCToronto @fordnation &amp; @celliottability...,decision racially demeaning asian canadian bel...
2,2022-03-07 14:55:50,Dr. Simone Gold #TrumpTerrorist #AntiVaxxer #H...,simone gold 'guilty f'ck' trespassing inside c...
3,2022-03-07 11:54:39,Another #antivaxxer who keeps whining about no...,another keep whining nothing cmon relax enjoy ...
4,2022-03-07 05:08:13,@KyleJGlen And yet according to the #antivaxxe...,yet according freedumb idiot isnt happening ni...


In [4]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download([
    "names",
    "stopwords",
    "state_union",
    "twitter_samples",
    "movie_reviews",
    "averaged_perceptron_tagger",
    "vader_lexicon",
    "punkt",
])

[nltk_data] Downloading package names to /home/xdy/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to /home/xdy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package state_union to /home/xdy/nltk_data...
[nltk_data]   Package state_union is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/xdy/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/xdy/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/xdy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/xdy/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date

True

In [5]:
# pseudo-labelling

def assign_labels(df):
    labels = []
    sia = SentimentIntensityAnalyzer()
    for _, row in df.iterrows():
        scores = sia.polarity_scores(row['tweet'])
        label = 0
        if scores['pos'] < scores['neg']:
            label = 1
        if scores['neg'] == 1:
            label = 1
        labels.append(label)
    df['label'] = labels
    return df

df = assign_labels(df)

print(df.head())
print(df.groupby(['label']).count())

                  date                                              tweet  \
0  2022-03-07 16:21:36  @MinMinHugs @flat__stanley @mimi_soapbox @mrma...   
1  2022-03-07 15:47:58  @CBCToronto @fordnation &amp; @celliottability...   
2  2022-03-07 14:55:50  Dr. Simone Gold #TrumpTerrorist #AntiVaxxer #H...   
3  2022-03-07 11:54:39  Another #antivaxxer who keeps whining about no...   
4  2022-03-07 05:08:13  @KyleJGlen And yet according to the #antivaxxe...   

                                       cleaned_tweet  label  
0  aunt heart attack survived sad dont course thi...      0  
1  decision racially demeaning asian canadian bel...      1  
2  simone gold 'guilty f'ck' trespassing inside c...      1  
3  another keep whining nothing cmon relax enjoy ...      1  
4  yet according freedumb idiot isnt happening ni...      0  
        date  tweet  cleaned_tweet
label                             
0      15065  15065          15065
1       9911   9911           9911


In [6]:
X = df['cleaned_tweet'].values
y = df['label'].values

In [7]:
# BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [8]:
print('Original:', X[0])
print('Tokenized:', tokenizer.tokenize(X[0]))
print('Token IDs:', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(X[0])))

Original: aunt heart attack survived sad dont course thing happen life unfortunately love though fit warped mind never called way
Tokenized: ['aunt', 'heart', 'attack', 'survived', 'sad', 'don', '##t', 'course', 'thing', 'happen', 'life', 'unfortunately', 'love', 'though', 'fit', 'warped', 'mind', 'never', 'called', 'way']
Token IDs: [5916, 2540, 2886, 5175, 6517, 2123, 2102, 2607, 2518, 4148, 2166, 6854, 2293, 2295, 4906, 25618, 2568, 2196, 2170, 2126]


In [9]:
# find maximum length
max_seq_len = 0
for text in X:
    input_ids = tokenizer.encode(text, add_special_tokens=True)
    max_seq_len = max(max_seq_len, len(input_ids))
print(max_seq_len)

67


In [10]:
# tokenize all text and map the tokens to word IDs
input_ids = []
attn_masks = []

for text in X:
    encoded_dict = tokenizer.encode_plus(text,
                                         add_special_tokens=True, # add [CLS] & [SEP]
                                         max_length=max_seq_len,
                                         pad_to_max_length=True, # pad 0 up to max length
                                         return_attention_mask=True,
                                         return_tensors='pt')
    input_ids.append(encoded_dict['input_ids'])
    attn_masks.append(encoded_dict['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attn_masks = torch.cat(attn_masks, dim=0)
labels = torch.tensor(y)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# dataset = TensorDataset(input_ids, attn_masks, labels)

# # train test split
# train_size = int(len(dataset) * 0.8)
# test_size = len(dataset) - train_size
# train_set, test_set = random_split(dataset, [train_size, test_size])

# # train val split
# val_size = int(train_size * 0.25)
# train_size = train_size - val_size
# train_set, val_set = random_split(train_set, [train_size, val_size])

# (len(train_set), len(test_set), len(val_set))

In [None]:
dataset = TensorDataset(input_ids, attn_masks, labels)

# training validation split
train_size = int(len(dataset) * 0.8)
val_size = len(dataset) - train_size
train_set, val_set = random_split(dataset, [train_size, val_size])

(len(train_set), len(val_set))

In [12]:
batch_size = 32
train_dataloader = DataLoader(train_set,
                              sampler=RandomSampler(train_set),
                              batch_size=batch_size)
val_dataloader = DataLoader(val_set,
                            sampler=SequentialSampler(val_set),
                            batch_size=batch_size)

In [13]:
# train classification model
bert_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
bert_model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [14]:
def format_time(t):
    time_rounded = int(round((t)))
    # hh:mm:ss
    return str(datetime.timedelta(seconds=time_rounded))

In [15]:
lr = 2e-5
eps = 1e-8
n_epochs = 2

save_model_path = 'model/save'
optimizer = torch.optim.AdamW(bert_model.parameters(), lr=lr, eps=eps)

In [None]:
# start training 

train_stats = {'Epoch':[], 'Loss':[], 'Accuracy':[]}
test_stats = {'Epoch':[], 'Loss':[], 'Accuracy':[]}

for epoch in range(n_epochs):
    start_time = time.time()

    print(f'Epoch:{epoch+1} / {n_epochs}')
    print('Training...')
    
    bert_model.train()
    loss = 0
    losses = []
    accuracy = 0
    accuracies = []
    
    for step, batch in enumerate(train_dataloader):
        if step % 100 == 0 and not step == 0:
            print(f'Batch {step} / {len(train_dataloader)}, time cost: {format_time(time.time() - start_time)}')

        batch_ids = batch[0].to(device)
        batch_masks = batch[1].to(device)
        batch_labels = batch[2].to(device)

        bert_model.zero_grad()

        """
        https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertForSequenceClassification

        Returns

        loss (torch.FloatTensor of shape (1,), optional, returned when labels is provided) — Classification (or regression if config.num_labels==1) loss.

        logits (torch.FloatTensor of shape (batch_size, config.num_labels)) — Classification (or regression if config.num_labels==1) scores (before SoftMax).

        hidden_states (tuple(torch.FloatTensor), optional, returned when output_hidden_states=True is passed or when config.output_hidden_states=True) — Tuple of torch.FloatTensor (one for the output of the embeddings + one for the output of each layer) of shape (batch_size, sequence_length, hidden_size).

        Hidden-states of the model at the output of each layer plus the initial embedding outputs.

        attentions (tuple(torch.FloatTensor), optional, returned when output_attentions=True is passed or when config.output_attentions=True) — Tuple of torch.FloatTensor (one for each layer) of shape (batch_size, num_heads, sequence_length, sequence_length).
        """

        outputs = bert_model(batch_ids,
                             token_type_ids=None,
                             attention_mask=batch_masks,
                             labels=batch_labels)
        loss = outputs.loss        
        logits = outputs.logits

        losses.append(loss.item())
        logits = logits.detach().cpu().numpy()
        label_ids = batch_labels.to('cpu').numpy()

        accuracy = np.sum(np.argmax(logits, axis=1) == label_ids) / len(label_ids)
        accuracies.append(accuracy)

        loss.backward()
        optimizer.step()
    
    train_stats['Epoch'].append(epoch+1)
    train_stats['Loss'].append(np.mean(losses))
    train_stats['Accuracy'].append(np.mean(accuracies))

    print('Total time cost for training :', format_time(time.time() - start_time))

    # torch.save(bert_model.state_dict(), save_model_path)

    print()
    print('Validating...')

    bert_model.eval()
    val_loss = 0
    val_losses = []
    val_accuracy = 0
    val_accuracies = []
    # n_val_steps = 0

    for batch in val_dataloader:
        batch_ids = batch[0].to(device)
        batch_masks = batch[1].to(device)
        batch_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = bert_model(batch_ids,
                                 token_type_ids=None,
                                 attention_mask=batch_masks,
                                 labels=batch_labels)
            val_loss = outputs.loss
            val_logits = outputs.logits
            
            val_losses.append(val_loss.item())
            val_logits = val_logits.detach().cpu().numpy()
            val_label_ids = batch_labels.to('cpu').numpy()

            val_accuracy = np.sum(np.argmax(val_logits, axis=1) == val_label_ids) / len(val_label_ids)
            val_accuracies.append(val_accuracy)

    test_stats['Epoch'].append(epoch+1)
    test_stats['Loss'].append(np.mean(val_losses))
    test_stats['Accuracy'].append(np.mean(val_accuracies))

    print()
    print('Finished Training.')
    print()


In [None]:
# train_stats

In [None]:
# test_stats

In [None]:
# plt.plot()

In [None]:
# evaluation on test dataset

# bert_model.eval()