# Setup

In [45]:
# install transformers package from Hugging Face transformers package contains 
# pre-trained BERT model and other useful interfaces
!pip install transformers 



In [46]:
# Check if Colab's GPU is available and set up the GPU device
import tensorflow as tf
import torch

device_name = tf.test.gpu_device_name()

if device_name == '/device:GPU:0':
    print('GPU is available')
else:
    raise SystemError('No GPU device available')

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('GPU name and type:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using CPU')
    device = torch.device("cpu")

GPU is available
GPU name and type: Tesla P4


# Data Preprocessing

In [74]:
# dataset pre-processing
# load data_extraction.ipynb
import pandas as pd
import numpy as np

iamc_ds = pd.read_json('iamc.json')

tweets = {}
labels = {}
count = 1

for annotator in iamc_ds:
    tweets['annotator' + str(count)] = np.array(list(iamc_ds[annotator][0]))
    labels['annotator' + str(count)] = np.array(list(iamc_ds[annotator][1]))
    count += 1 

print('example 1')
print('tweet: ', tweets['annotator1'][0])
print('label: ', labels['annotator1'][0])

print('')
print('example 2')
print('tweet: ', tweets['annotator3'][1])
print('label: ', labels['annotator3'][1])


example 1
tweet:  Wholeheartedly support these protests ; acts of civil disobedience ; will join when I can! #Ferguson #AllLivesMatter 
label:  [0 0 1 0 0 0 0 0 0 0 0]

example 2
tweet:  This Sandra Bland situation man no disrespect rest her soul , but people die everyday in a unjustified matter #AllLivesMatter
label:  [0 0 0 0 0 1 0 0 0 0 0]


# Training Prep

### Input Tokenization & Encoding

In [75]:
from transformers import BertTokenizer

print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# visualize BERT tokenization by an example
print('Before:', tweets['annotator1'][0])
print('After(words):', tokenizer.tokenize(tweets['annotator1'][0]))
print('After(ids):', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(tweets['annotator1'][0])))

Loading BERT tokenizer...
Before: Wholeheartedly support these protests ; acts of civil disobedience ; will join when I can! #Ferguson #AllLivesMatter 
After(words): ['whole', '##hearted', '##ly', 'support', 'these', 'protests', ';', 'acts', 'of', 'civil', 'di', '##so', '##bed', '##ience', ';', 'will', 'join', 'when', 'i', 'can', '!', '#', 'ferguson', '#', 'all', '##li', '##ves', '##mat', '##ter']
After(ids): [2878, 27693, 2135, 2490, 2122, 8090, 1025, 4490, 1997, 2942, 4487, 6499, 8270, 13684, 1025, 2097, 3693, 2043, 1045, 2064, 999, 1001, 11262, 1001, 2035, 3669, 6961, 18900, 3334]


In [78]:
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

train_data_loaders = {}
val_data_loaders = {}
test_data_loaders = {}

for annotator in tweets:
    # input tokenization
    tweets_one_anno = tweets[annotator]

    # Get the maximum length of the dataset since all input instances have to have a constant length
    max_len = 0

    for tweet in tweets_one_anno:
        max_len = max(max_len, len(tweet.split()))

    max_len += 2
    print(annotator)
    print('Max tweet length:', max_len)

    # tokenize all tweets, acquire corresponding token ids and attention masks
    input_ids = []
    attention_masks = []

    # encode input
    for tweet in tweets_one_anno:
        encoded_dict = tokenizer.encode_plus(tweet, 
                                            add_special_tokens=True, 
                                            max_length=max_len,
                                            pad_to_max_length=True,
                                            return_attention_mask=True,
                                            return_tensors='pt'
                                            )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels_one_anno = torch.tensor(labels[annotator], dtype=torch.float)

    # split the dataset and generate dataloader
    dataset = TensorDataset(input_ids, attention_masks, labels_one_anno)

    # training 80% validation 10% testing 10%
    train_size = int(0.8 * len(dataset))
    val_size = int(0.1 * len(dataset))
    test_size = len(dataset) - train_size - val_size

    print('training dataset size:', train_size)
    print('validation dataset size:', val_size)
    print('testing dataset size:', test_size)
    print('')

    train_ds, val_ds, test_ds = random_split(dataset, [train_size, val_size, test_size])

    # create a dataloader
    batch_size = 32

    train_dl = DataLoader(train_ds, 
                        sampler=RandomSampler(train_ds), 
                        batch_size=batch_size)
    train_data_loaders[annotator] = train_dl

    val_dl = DataLoader(val_ds, 
                        sampler=SequentialSampler(val_ds), 
                        batch_size=batch_size)
    val_data_loaders[annotator] = val_dl

    test_dl = DataLoader(test_ds, 
                        sampler=SequentialSampler(test_ds), 
                        batch_size=batch_size)
    test_data_loaders[annotator] = test_dl
    


ERROR! Session/line number was not unique in database. History logging moved to new sessionannotator1
 61
Max tweet length: 33
training dataset size: 6282
validation dataset size: 785
testing dataset size: 786

annotator2
Max tweet length: 31
training dataset size: 5776
validation dataset size: 722
testing dataset size: 723

annotator3
Max tweet length: 33
training dataset size: 11582
validation dataset size: 1447
testing dataset size: 1449

annotator4
Max tweet length: 33
training dataset size: 9034
validation dataset size: 1129
testing dataset size: 1130

annotator5
Max tweet length: 29
training dataset size: 500
validation dataset size: 62
testing dataset size: 64

annotator6
Max tweet length: 33
training dataset size: 6394
validation dataset size: 799
testing dataset size: 800

annotator7
Max tweet length: 32
training dataset size: 731
validation dataset size: 91
testing dataset size: 92

annotator8
Max tweet length: 32
training dataset size: 3058
validation dataset size: 382
testi

### Building Multi-Label BERT Model

In [0]:
from transformers import BertModel, BertConfig, BertPreTrainedModel
from torch.nn import BCEWithLogitsLoss, Sigmoid

class BertForMultiLabelSequenceClassification(BertPreTrainedModel):
    """ 
    Bert for multi-label classification 
    """
    def __init__(self, config, num_labels=11):
        super(BertForMultiLabelSequenceClassification, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.classifier = torch.nn.Linear(config.hidden_size, num_labels)

    
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        
        if labels is not None:
            new_loss = BCEWithLogitsLoss()
            loss = new_loss(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels))
            return loss, logits
        else:
            return logits

### Model Visualization

In [80]:
model = BertForMultiLabelSequenceClassification.from_pretrained("bert-base-uncased", 
                                                                 output_attentions=False, 
                                                                 output_hidden_states=False)

model.cuda()

#visualize
params = list(model.named_parameters())

for p in params:
    print("{:<60} {:>15}".format(p[0], str(tuple(p[1].size()))))


bert.embeddings.word_embeddings.weight                          (30522, 768)
bert.embeddings.position_embeddings.weight                        (512, 768)
bert.embeddings.token_type_embeddings.weight                        (2, 768)
bert.embeddings.LayerNorm.weight                                      (768,)
bert.embeddings.LayerNorm.bias                                        (768,)
bert.encoder.layer.0.attention.self.query.weight                  (768, 768)
bert.encoder.layer.0.attention.self.query.bias                        (768,)
bert.encoder.layer.0.attention.self.key.weight                    (768, 768)
bert.encoder.layer.0.attention.self.key.bias                          (768,)
bert.encoder.layer.0.attention.self.value.weight                  (768, 768)
bert.encoder.layer.0.attention.self.value.bias                        (768,)
bert.encoder.layer.0.attention.output.dense.weight                (768, 768)
bert.encoder.layer.0.attention.output.dense.bias                      (768,)

### Optimizer & Learning Rate Scheduler

In [0]:
from transformers import get_linear_schedule_with_warmup, AdamW

# set up the optimizer
optimizer = AdamW(model.parameters(),
                  lr=2e-5,
                  eps=1e-8)

# set up the lr scheduler
epochs = 3
total_steps = len(train_dl) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

### Helper Functions

In [0]:
import time
import datetime
from sklearn.metrics import hamming_loss, accuracy_score

def get_accuracy_hamming_loss(preds, labels):
    preds = preds >= 0.5

    return 1 - hamming_loss(preds, labels)

def get_accuracy_exact_match(preds, labels):
    preds = preds >= 0.5

    return accuracy_score(preds, labels)

# correct labels out of cases where prediction and labels are not both 0
def get_accuracy_none_zero(preds, labels):
    preds = preds >= 0.5

    total = []
    for i in range(len(preds)):
        suM = 0
        for j in range(len(preds[i])):
            correct = 0
            #correct
            if preds[i][j] == 1 and labels[i][j]== 1 :
                suM += 1
                correct += 1
            #missed 
            elif labels[i][j]== 1 and preds[i][j] == 0:
                suM += 1
            elif labels[i][j]== 0 and preds[i][j] == 1:
                suM += 1
        
        if suM != 0:
            total.append(correct/suM)

    return np.mean(total)
                

# take a time in seconds and returns a string hh:mm:ss
def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    
    return str(datetime.timedelta(seconds=elapsed_rounded))


# Training

In [85]:
import random

seed_val = 1
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = {}

for annotator in tweets:
    training_stats[annotator] = []
    total_t0 = time.time() 

    train_dl = train_data_loaders[annotator]
    val_dl = val_data_loaders[annotator]
    print("***************************************")
    print('Training ', annotator.upper())

    # initialize BERT weights
    model.init_weights()
    for epoch_i in range(0, epochs):
        print("")
        print('======= Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print("")

        t0 = time.time()
        total_train_loss = 0
        model.train()

        for step, batch in enumerate(train_dl):
            # periodically update elapsed time
            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)

                print('Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, 
                                                                    len(train_dl), 
                                                                    elapsed))
            # unpack current batch's input & labels   
            cur_input_ids = batch[0].to(device)
            cur_input_mask = batch[1].to(device)
            cur_labels = batch[2].to(device)

            # clear previously calculated gradients
            model.zero_grad()

            # perform a forward pass
            # logits = classifications, before activation function e.x. softmax
            loss, logits = model(cur_input_ids, 
                                token_type_ids=None,
                                attention_mask=cur_input_mask,
                                labels=cur_labels)
                    
            # accumulate training loss
            total_train_loss += loss.item()

            # perform a backward pass to calculate the gradients of params
            loss.backward()

            # clip the gradients if it is not in [-1,1]
            # torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # update weights
            optimizer.step()

            # update lr
            scheduler.step()

        # calculate stats after 1 epoch of training
        avg_train_loss = total_train_loss / len(train_dl)
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(training_time))

        # validation 
        print("")
        print("Validating...")

        t0 = time.time()

        model.eval()

        total_eval_accuracy_hamming_loss = 0
        total_eval_accuracy_exact_match = 0
        total_eval_loss = 0
        nb_eval_steps = 0

        for batch in val_dl:
            
            # unpack current batch's input and labels
            cur_input_ids = batch[0].to(device)
            cur_input_mask = batch[1].to(device)
            cur_labels = batch[2].to(device)

            # no need to calculate and trace gradient
            with torch.no_grad():
                loss, logits = model(cur_input_ids,
                                    token_type_ids=None,
                                    attention_mask=cur_input_mask,
                                    labels=cur_labels)
                        
            # no need to calculate prediction & labels in gpu
            # good practice when involves with large-scale dataset
            logits = logits.detach().cpu().numpy()
            label_ids = cur_labels.to('cpu')
            
            # calculate this batch's accuracy & loss and accumulate with other 
            # batches' accuracies & loss
            total_eval_accuracy_hamming_loss += get_accuracy_hamming_loss(logits, label_ids)
            total_eval_accuracy_exact_match += get_accuracy_exact_match(logits, label_ids)
            
            total_eval_loss += loss.item()

        # average out loss and accuracy across all batches
        avg_val_accuracy_hamming_loss = total_eval_accuracy_hamming_loss / len(val_dl)
        avg_val_accuracy_exact_match = total_eval_accuracy_exact_match / len(val_dl)
        avg_val_loss = total_eval_loss / len(val_dl)
        validation_time = format_time(time.time() - t0)

        print("Loss: {0:.2f}, Time elapsed: {1:}".format(avg_val_loss, validation_time))
        print("==== Accuracy ====")
        print("Hamming Loss:", avg_val_accuracy_hamming_loss)
        print("Exact Match", avg_val_accuracy_exact_match)

        # record all statistics for this epoch
        training_stats[annotator].append({
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur': avg_val_accuracy_hamming_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        })


    print("")
    print("Training for {0:} Completed. Training took {1:}".format(annotator, format_time(time.time() - total_t0)))
    print("***************************************")
    print("")


print("Training Completed for ALL Annotators")

***************************************
Training  ANNOTATOR1


Batch    40 of   197. Elapsed: 0:00:11.
Batch    80 of   197. Elapsed: 0:00:21.
Batch   120 of   197. Elapsed: 0:00:32.
Batch   160 of   197. Elapsed: 0:00:42.

  Average training loss: 0.69
  Training epcoh took: 0:00:52

Validating...
Loss: 0.69, Time elapsed: 0:00:02
==== Accuracy ====
Hamming Loss: 0.8896991978609627
Exact Match 0.0025


Batch    40 of   197. Elapsed: 0:00:10.
Batch    80 of   197. Elapsed: 0:00:21.
Batch   120 of   197. Elapsed: 0:00:31.
Batch   160 of   197. Elapsed: 0:00:42.

  Average training loss: 0.69
  Training epcoh took: 0:00:52

Validating...
Loss: 0.69, Time elapsed: 0:00:02
==== Accuracy ====
Hamming Loss: 0.8896991978609627
Exact Match 0.0025


Batch    40 of   197. Elapsed: 0:00:11.
Batch    80 of   197. Elapsed: 0:00:21.
Batch   120 of   197. Elapsed: 0:00:32.
Batch   160 of   197. Elapsed: 0:00:42.

  Average training loss: 0.69
  Training epcoh took: 0:00:52

Validating...
Loss: 0.69, 

KeyboardInterrupt: ignored

In [61]:
# Display training stats
# Display numbers with two decimal
pd.set_option('precision', 2)

for annotator in tweets:
    df_stats = pd.DataFrame(data=training_stats[annotator])
    df_stats = df_stats.set_index('epoch')
    display(df_stats)

Unnamed: 0_level_0,Training Loss,Valid. Loss,Valid. Accur,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.69,0.69,0.87,0:01:35,0:00:03
2,0.69,0.69,0.87,0:01:35,0:00:03
3,0.69,0.69,0.87,0:01:35,0:00:03


# Testing

In [69]:
model.eval()

for annotator in tweets:
    total_accuracy_test = 0
    test_dl = test_data_loaders[annotator]

    for batch in test_dl:
        # unpack current batch's input and labels
        cur_input_ids = batch[0].to(device)
        cur_input_mask = batch[1].to(device)
        cur_labels = batch[2].to(device)

        # no need to calculate and trace gradient
        with torch.no_grad():
            loss, logits = model(cur_input_ids,
                                token_type_ids=None,
                                attention_mask=cur_input_mask,
                                labels=cur_labels)
                        
        # no need to calculate prediction & labels in gpu
        # good practice when involves with large-scale dataset
        logits = logits.detach().cpu().numpy()
        label_ids = cur_labels.to('cpu')
            
        # calculate this batch's accuracy & loss and accumulate with other 
        # batches' accuracies & loss
        total_accuracy_test += get_accuracy_hamming_loss(logits, label_ids)
            

    print('Testing Accuracy for {}: {:.2f}'.format(annotator, total_accuracy_test/len(test_dl)))

print('Testing Completed')


Predicting labels for 1,472 test sentences...
Testing Accuracy: 0.87
Testing Completed
