# Setting up the GPU for the program

In [1]:
import tensorflow as tf


# Get the GPU device name:
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [2]:
import torch

if torch.cuda.is_available():

  device = torch.device('cuda')
  print('There are %d GPU(s) available' % torch.cuda.device_count())
  print('This program is using GPU:', torch.cuda.get_device_name(0))

else:
  print('No GPU available, using CPU instead.')
  device = torch.device('cpu')


There are 1 GPU(s) available
This program is using GPU: Tesla T4


Installing the Hugging Face Library and the emoji library



In [3]:
!pip install transformers
!pip install emoji



# Loading in the annotated Abuse Dataset




In [4]:
# Run this cell to mount your Google Drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data inspection


In [5]:
import pandas as pd
import emoji
import re
from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import numpy as np



path = '/content/drive/MyDrive/DALC/'

# manually curate split
#train = pd.read_csv(path + 'dalc_v2_train.csv', delimiter='\t', header=0,  names=['id', 'text', 'user', 'offensive',
#                                                                       'abusive', 'target',
#                                                                       'source', 'user.description'])
#
#dev = pd.read_csv(path + 'dalc_v2_dev.csv', delimiter='\t', header=0,  names=['id', 'text', 
#                                                                   'user', 'offensive',
#                                                                   'abusive', 'target',
#                                                                   'source', 'user.description'])
#

#test = pd.read_csv(path + 'dalc_v2_test.csv', delimiter='\t', header=0,  names=['id', 'text',
#                                                        'user', 'offensive',
#                                                        'abusive', 'target',
#                                                        'source', 'user.description'])

# random split
train = pd.read_csv(path + 'dalc_v2_train_random.csv', delimiter='\t', header=0,  names=['id', 'text', 'user', 'offensive',
                                                                       'abusive', 'target',
                                                                       'source', 'user.description'])

dev = pd.read_csv(path + 'dalc_v2_dev_random.csv', delimiter='\t', header=0,  names=['id', 'text', 
                                                                   'user', 'offensive',
                                                                   'abusive', 'target',
                                                                   'source', 'user.description'])


test = pd.read_csv(path + 'dalc_v2_test_random.csv', delimiter='\t', header=0,  names=['id', 'text',
                                                        'user', 'offensive',
                                                        'abusive', 'target',
                                                        'source', 'user.description'])



print('Number of training sentences: {:,}\n'.format(train.shape[0]))
print('Number of dev sentences: {:,}\n'.format(dev.shape[0]))
print('Number of test sentences: {:,}\n'.format(test.shape[0]))
print(train[['text', 'abusive', 'target']].head())

Number of training sentences: 5,706

Number of dev sentences: 549

Number of test sentences: 1,901

                                                text   abusive      target
0  Misschien de ideale assistente van Francesca V...       NOT         NaN
1  en er stond een guy tegenover mij mr ik moest ...  EXPLICIT  INDIVIDUAL
2  #NL_actueel Uitgewezen asielzoekers steeds vak...       NOT         NaN
3  @beatsbyarti @mrouwen met wat je hier nu zegt,...  EXPLICIT  INDIVIDUAL
4  Waar komt al dat vertrouwen in @SanderDekker t...       NOT         NaN


The sentence column will be used during the classifcation task.


In [6]:
# Placing the sentence and label columns into a list of values

train_labels = train.abusive.values
dev_labels = dev.abusive.values
test_labels = test.abusive.values

# Reformatting the labels binary, 0 = not abusive, 1 = abusive
def reformat_labels(labels):
  b_labels = []
  abusive_count = 0
  not_count = 0
  explicit_count = 0
  implicit_count = 0
  
  for label in labels:
    # binary
    #if label == 'NOT':
    #  not_count += 1
    #  b_labels.append(0)
    #else:
    #  abusive_count += 1
    #  b_labels.append(1)

    # ternary
    if label == 'NOT':
      not_count += 1
      b_labels.append(0)
    elif label == 'EXPLICIT':
      explicit_count += 1
      b_labels.append(1)
    else:
      abusive_count += 1
      b_labels.append(2)


#  return b_labels, abusive_count, not_count # binary
  return b_labels, explicit_count, implicit_count, not_count # ternary

# binary
#print('Formatting train labels:')
#train_labels, ab_count, not_count = reformat_labels(train_labels)
#print('Abusive: {} | Not: {}'.format(ab_count, not_count))
#print('Formatting dev labels:')
#dev_labels, ab_count, not_count = reformat_labels(dev_labels)
#print('Abusive: {} | Not: {}'.format(ab_count, not_count))
#print('Formatting test labels:')
#test_labels, ab_count, not_count = reformat_labels(test_labels)
#print('Abusive: {} | Not: {}'.format(ab_count, not_count))

# ternary
print('Formatting train labels:')
train_labels, ab_count_exp, ab_count_imp, not_count = reformat_labels(train_labels)
print('Exp: {} | Imp: {} | Not: {}'.format(ab_count_exp, ab_count_imp, not_count))
print('Formatting dev labels:')
dev_labels, ab_count_exp, ab_count_imp, not_count = reformat_labels(dev_labels)
print('Exp: {} | Imp: {} | Not: {}'.format(ab_count_exp, ab_count_imp, not_count))
print('Formatting test labels:')
test_labels, ab_count_exp, ab_count_imp, not_count = reformat_labels(test_labels)
print('Exp: {} | Imp: {} | Not: {}'.format(ab_count_exp, ab_count_imp, not_count))



#print('Checking if labels were formatted correctly:')
#print(train_labels)


Formatting train labels:
Exp: 738 | Imp: 0 | Not: 4564
Formatting dev labels:
Exp: 77 | Imp: 0 | Not: 439
Formatting test labels:
Exp: 412 | Imp: 0 | Not: 1264


# Pre-processing steps
For pre-processing we decided to change each user mention (@folkert) to Name, change links to URL, remove numbers and emojis.


In [7]:
def clean_samples(data):

  new_samples = []
  #print(data.head())

  content = list(data['text'].values)
  for tweet_message in content:
      tweet_message = re.sub(r'https.*[^ ]', 'URL', tweet_message)
      tweet_message = re.sub(r'http.*[^ ]', 'URL', tweet_message)
      tweet_message = re.sub(r'@([^ ]*)', '@USER', tweet_message)
      tweet_message = emoji.demojize(tweet_message)
      tweet_message = re.sub(r'(:.*?:)', r' \1 ', tweet_message)
      tweet_message = re.sub(' +', ' ', tweet_message)
      new_samples.append(tweet_message)

  return new_samples

# Formatting other dataframes as well
train_clean = clean_samples(train) # list
dev_clean = clean_samples(dev) # list
test_clean = clean_samples(test) # list

#print(dev_clean[0:3])




# Tokenization & Input Formatting

In [8]:
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('wietsedv/bert-base-dutch-cased')

Loading BERT tokenizer...


In [9]:
# Inspecting the output of the tokenizer

# Original Sentence
print('Original: ', train_clean[0])
# Sentence split into tokens
print('Tokenized: ', tokenizer.tokenize(train_clean[0]))
# Sentence mapped to token ids
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(train_clean[0])))

Original:  Misschien de ideale assistente van Francesca Vanthielen in het #communicatie-team van #unia? Werken aan de campagne "#racisme en #klimaat"? URL
Tokenized:  ['Miss', '##chi', '##en', 'de', 'ideale', 'assistent', '##e', 'van', 'Francesca', 'Van', '##thi', '##elen', 'in', 'het', '[UNK]', 'communicatie', '-', 'team', 'van', '[UNK]', 'uni', '##a', '?', 'Werken', 'aan', 'de', 'campagne', '"', '[UNK]', 'racisme', 'en', '[UNK]', 'klimaat', '"', '?', 'U', '##RL']
Token IDs:  [4900, 24589, 25108, 10537, 13566, 8605, 117, 20722, 2717, 7222, 28981, 25070, 13644, 13261, 0, 10250, 12, 19884, 20722, 0, 20651, 113, 27, 7593, 7862, 10537, 10052, 6, 0, 17939, 11281, 0, 14357, 6, 27, 47, 23618]


In [10]:
max_len = 0
length = []
for sent in train_clean:
  
  # Tokenize the text and adding [CLS] and [SEP] tokens
  input_ids = tokenizer.encode(sent, add_special_tokens = True)

  # Update the maximum sentence length
  max_len = max(max_len, len(input_ids))
  length.append(len(input_ids))

avg_len = sum(length) // len(length)

print('Max sentence length: ', max_len)
print('Average sentence length: ', avg_len)
print(length)

Token indices sequence length is longer than the specified maximum sequence length for this model (528 > 512). Running this sequence through the model will result in indexing errors


Max sentence length:  528
Average sentence length:  36
[39, 73, 18, 61, 16, 22, 18, 17, 16, 31, 22, 15, 22, 83, 29, 41, 33, 21, 19, 17, 34, 34, 72, 30, 16, 26, 24, 22, 49, 28, 20, 48, 23, 49, 27, 76, 58, 23, 14, 39, 20, 18, 87, 8, 31, 31, 13, 77, 17, 32, 28, 11, 84, 13, 28, 60, 36, 4, 91, 14, 26, 74, 37, 17, 46, 29, 37, 26, 17, 22, 75, 30, 51, 19, 72, 52, 72, 14, 37, 12, 36, 7, 34, 63, 38, 33, 37, 16, 56, 33, 25, 27, 22, 25, 23, 33, 70, 32, 23, 20, 23, 30, 31, 17, 19, 20, 15, 48, 31, 16, 10, 25, 16, 22, 69, 16, 13, 63, 32, 60, 117, 66, 43, 37, 47, 24, 35, 23, 26, 29, 27, 41, 25, 21, 83, 18, 19, 29, 23, 17, 19, 49, 40, 17, 63, 58, 23, 31, 77, 31, 33, 170, 24, 72, 45, 15, 40, 29, 61, 32, 38, 40, 64, 11, 36, 51, 31, 38, 56, 22, 75, 65, 53, 7, 33, 19, 45, 60, 33, 68, 55, 54, 32, 17, 25, 45, 14, 36, 35, 22, 23, 24, 63, 21, 56, 39, 5, 34, 45, 44, 22, 23, 72, 24, 38, 46, 59, 69, 53, 17, 12, 25, 39, 31, 25, 17, 49, 16, 28, 24, 62, 25, 33, 39, 22, 11, 25, 47, 11, 96, 32, 50, 32, 54, 35, 24, 30,

The code above shows that the maximum sentence and the average sentence length, on which the max sentence length can be based

The `tokenizer.encode_plus` will do the acutal tokenization:


1.   Split the sentences into tokens
2.   Add the special `[CLS]` and `[SEP]` tokens
3.   Map the tokens to their IDS
4.   Pad or truncate all sentences to the same length
5.   Create the attention masks which explicitly differentiate real tokens from `[PAD]` tokens


NOTE: `tokenizer.encode` does not add the `[PAD]` tokens, therefore I will be using `tokenizer.encode_plus`







In [11]:
# Tokenize all of the sentences and map the tokens to their word ID, both for training, as for test

def prepare_bert_data(sentences):
  input_ids = []
  attention_masks = []

  for sent in sentences:
    encoded_dict = tokenizer.encode_plus(sent, # the sentence
                                        add_special_tokens = True,  # add [CLS] and [SEP] tokens
                                        max_length = 100,   # PAD and truncate all sentences
                                        pad_to_max_length = True,
                                        truncation=True,  
                                        return_attention_mask = True, # Construct attn. masks
                                        return_tensors = 'pt'       # Return pytorch tensors
                                        )

    # Add the encoded sentence to the list
    input_ids.append(encoded_dict['input_ids'])

    # Add its attention mask (Differentiates padding from non-padding)
    attention_masks.append(encoded_dict['attention_mask'])

  # Converting the lists into tensors
  input_ids = torch.cat(input_ids, dim = 0)
  attention_masks = torch.cat(attention_masks, dim = 0)

  return input_ids, attention_masks

train_input_ids, train_attention_masks = prepare_bert_data(train_clean)
train_labels = torch.tensor(train_labels)

dev_input_ids, dev_attention_masks = prepare_bert_data(dev_clean)
dev_labels = torch.tensor(dev_labels)

test_input_ids, test_attention_masks = prepare_bert_data(test_clean)
test_labels = torch.tensor(test_labels)

# Print sentence 0, now a list of IDS.
print('Original Dev: ', dev_clean[0])
print('Token IDs: ', dev_input_ids[0])



Original Dev:  @USER @USER In #Breda worden kinderen gefilmd door #huisarts Baban&amp;Kappelhof. @USER klokkenluider nu dood. Lees tweets @USER 
#pedos #depla
Token IDs:  tensor([    1,     0,  7127, 23317,     0,  7127, 23317,  3570,     0,  1440,
        22591, 14261, 12018, 10871,     0, 13480,  1020, 23967,     7,     0,
           26,  3898, 27726, 25957,    13,     0,  7127, 23317, 14369, 16155,
        10850,    13,  4276, 20433,   132,   131,     0,  7127, 23317,     0,
        17343, 24921,     0, 10537, 27792,     2,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3])


# Training & Validating Split
For the thesis task, we have already divided the data into the final split

In [12]:
# Combining the training inputs into a TensorDataset
train_ = TensorDataset(train_input_ids, train_attention_masks, train_labels)
dev_ = TensorDataset(dev_input_ids, dev_attention_masks, dev_labels)
test_ = TensorDataset(test_input_ids, test_attention_masks, test_labels)

print('{:>5,} training samples'.format(len(train_)))
print('{:>5,} validation samples'.format(len(dev_)))

5,706 training samples
  549 validation samples


Below, I will create an iterator for my dataset, by using the torch `DataLoader` class. This helps save on memory during training beause, unlinke a for loop, with iterator the entire dataset does not need to be loaded into memory.

In [13]:
print('Initializing Dataloaders......')
# Specifying the batch size, for fine tuning, 16 or 32 is recommended.
batch_size = 32

# Creating the dataloaders, train is randomlly sampled, test is sequentially sampled
train_dataloader = DataLoader(train_,
                              sampler = RandomSampler(train),
                              batch_size = batch_size
                              )

validation_dataloader = DataLoader(dev_,
                                    sampler = RandomSampler(dev), # Original was a SequentialSampler(test)
                                    batch_size = batch_size
                                  )

test_dataloader = DataLoader(test_,
                            sampler = SequentialSampler(test),
                            batch_size = batch_size
                            )

Initializing Dataloaders......


# Training the Classification Model
Now that I have tokenized and loaded all the data, I will fine-tune BERT for sequence classification

In [14]:
from transformers import BertForSequenceClassification, AdamW, BertConfig


# Loading M-bert base-uncased 102 languages - binary
#model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-uncased', 
#                                                        num_labels = 2,       # The number of output labels for binary classifcation
#                                                        output_attentions = False, # Whether or not the model returns attentions weights
#                                                        output_hidden_states = False  # Whether or not the model returns all hidden states
#                                                      )


# Loading BERTje model - binary:
#model = BertForSequenceClassification.from_pretrained('GroNLP/bert-base-dutch-cased', 
#                                                        num_labels = 2,       # The number of output labels for binary classifcation
#                                                        output_attentions = False, # Whether or not the model returns attentions weights
#                                                        output_hidden_states = False  # Whether or not the model returns all hidden states
#                                                      )

# Loading BERTje model - ternary:
model = BertForSequenceClassification.from_pretrained('GroNLP/bert-base-dutch-cased', 
                                                        num_labels = 3,       # The number of output labels for binary classifcation
                                                        output_attentions = False, # Whether or not the model returns attentions weights
                                                        output_hidden_states = False  # Whether or not the model returns all hidden states
                                                      )


# Telling pytorch to run this model on the GPU
model.cuda()

Some weights of the model checkpoint at GroNLP/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=3)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

Inspecting the BERT model

In [15]:
from transformers import get_linear_schedule_with_warmup
# Initializing the optimizer, learning rate = 2e-5 and the epsilon is set to 1e-8 to prevent any zero division errors
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8)

# I have chosen 4 epochs at the moment, but this might cause overfitting
epochs = 5

# Calculating how many steps have to be taken
total_steps = len(train_dataloader) * epochs

# Creating the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

**Creating the training loop for the classification task**

The process for this loop is taken from the example which states:

**Training:**


*   Unpack our data inputs and labels
*   Load data onto GPU for acceleration
*   Clear out the gradients calculated in the previous pass
*   Forward pass
*   Backward pass
*   Tell the network to update parameters with `optimizer.step()`
*   Track variables for monitoring progress


**Evaluation**


*   Unpack our data inputs and labels
*   Load data onto the GPU for acceleration
*   Forward pass (feed input data through network)
*   Comput loss on our validation data and track variables for monitoring progress


Below, I have first created a helper function that can calculate the Accuracy.

In [16]:
import time
import datetime
import numpy as np


def flat_accuracy(preds, labels):
  pred_flat = np.argmax(preds, axis = 1).flatten()
  labels_flat = labels.flatten()
  return np.sum(pred_flat == labels_flat) / len(labels_flat)


def get_preds(preds, labels):
  return np.argmax(preds, axis = 1).flatten(), labels.flatten()


def analyze_preds(preds, labels):
  tp = 0
  tn = 0
  fp = 0
  fn = 0
  for index, val in enumerate(preds):
    if val == 1 and labels[index] == 1:
      # true positive
      tp += 1
    elif val == 1 and labels[index] == 2: # ternary
      # true positive
      tp += 1
    elif val == 0 and labels[index] == 0: 
      # true negative
      tn += 1
    elif val == 1 and labels[index] == 0:
      # false positive
      fp += 1
    elif val == 2 and labels[index] == 0: # ternary
      # false positive
      fp += 1 
    elif val == 1 and labels[index] == 2: # ternary
      # false positive
      fp += 1
    elif val == 2 and labels[index] == 1: # ternary
      # false positive
      fp += 1
    else:
      # false negative
      fn += 1
    
  return [tp, tn, fp, fn]


def format_time(elapsed):
  """ Takes a time in seconds and returns a string in hh:mm:ss"""
  elapsed_rounded = int(round(elapsed))
  return str(datetime.timedelta(seconds=elapsed_rounded))



In [17]:
#print(train_dataloader)
#for step, batch in enumerate(train_dataloader):
#  print(batch)

**Starting training**

In [18]:
import random
import numpy as np
from sklearn.metrics import confusion_matrix, f1_score, precision_recall_fscore_support, precision_score, recall_score, classification_report

seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []

final_prec = []
final_rec = []
final_f1 = []
final_conf_matrix = []

# Time measuring the training process
total_t0 = time.time()

# Tracking lists for calculating the average F1 score over the epochs

for epoch_i in range(0, epochs):

  # ================================
  #           TRAINING!
  # ================================
  print("")
  print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
  print('Training...')
  final = False
  # Checking if the current epoch is the final one
  if epoch_i + 1 == epochs:
    final = True
  # Measuring how long one epoch takes:
  t0 = time.time()

  # Reset the total loss for this epoch
  total_train_loss = 0

  # Chaning the mode of the model to train:
  model.train()
  train_preds = []
  train_labels = []

  # Tracking lists for the scorers:
  train_prec = []
  train_rec = []
  train_f1 = []

  test_prec = []
  test_rec = []
  test_f1 = []

  # For each batch of training data...
  for step, batch in enumerate(train_dataloader):

    # Progress update every 40 batches ( even bekijken hoe ik dit met abusive data wil doen)
    if step % 40 == 0 and not step == 0:
      # calculating elapsed time in minutes
      elapsed = format_time(time.time() - t0)
      # reporting progress
      print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

    # Hier nog controleren of de lengte van de batch wel hetzelfde is
    #print(len(batch))

    # Copying each tensor to the GPU
    # Batch contains:
    # [0] = input ids
    # [1] = attention masks
    # [2] = labels

    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    # Clearing previously calculated gradients
    model.zero_grad()

    # Performing a forward pass
    loss, logits = model(b_input_ids,
                         token_type_ids=None,
                         attention_mask=b_input_mask,
                         labels=b_labels,
                         return_dict=False)
    
  #  print(logits)
  
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    
    # Accumulate the training loss over all of the batches (for average loss at the end)
    total_train_loss += loss.item()
    preds, labs = get_preds(logits, label_ids)
    train_preds.extend(preds)
    train_labels.extend(labs)
    # Performing a backward pass to calculate the gradients
    loss.backward()

    # Clipping the norm of the gradients to 1.0
    # This prevents 'exploding gradients'
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # Updating the parameters and take a step using the computed gradient
    optimizer.step()

    # Update the learning rate
    scheduler.step()


  conf_matrix = confusion_matrix(train_labels, train_preds)
  print('Confusion Matrix: ')
  print(conf_matrix)
  print("\n")

  # binary
  #tn, fp, fn, tp = conf_matrix.ravel() # binary
  #print('TN: {} | TP: {} | FN: {} | FP: {}'.format(tn, tp, fn, fp)) # binary

  # ternary values
  FP = conf_matrix.sum(axis=0) - np.diag(conf_matrix) # ternary
  FN = conf_matrix.sum(axis=1) - np.diag(conf_matrix) # ternary
  TP = np.diag(conf_matrix) # ternary
  TN = conf_matrix.sum() - (FP + FN + TP) # ternary
  
  fp = FP.astype(float) # ternary
  fn = FN.astype(float) # ternary
  tp = TP.astype(float) # ternary
  tn = TN.astype(float) # ternary

  print("\n")
  precision = tp / (tp + fp)
  recall = tp / (tp + fn)
  f_score = 2 * ((precision * recall) / (precision + recall))
  print('Precision: ', precision)
  print('Recall: ', recall)
  print('F1: ', f_score)
  print("\n")
  # Calculate the average loss over all of the batches
  avg_train_loss = total_train_loss / len(train_dataloader)

  # Measure how long this epoch took
  training_time = format_time(time.time() - t0)
  print("  Average training loss: {0:.2f}".format(avg_train_loss))
  print("  Training epcoh took: {:}".format(training_time))
  print("\n")

  # ========================================
  #               Validation
  # ========================================
  # After the completion of each training epoch, measure our performance on
  # our validation set.

  print("Running Validation")
  print("\n")
  t0 = time.time()

  # Put the model into evaluation mode
  model.eval()

  # Tracking variables
  total_eval_accuracy = 0
  total_eval_loss = 0
  nb_eval_steps = 0
  tp = 0
  fp = 0
  fn = 0
  tn = 0
  predicted_labels = []
  actual_labels = []

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Unpacking the validation batch
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    # For validation, we don't need to construct the compute graph
    with torch.no_grad():

      (loss, logits) = model(b_input_ids,
                             token_type_ids = None,
                             attention_mask = b_input_mask,
                             labels = b_labels,
                             return_dict=False)
    
    # Accumulate the validation loss
    total_eval_loss += loss.item()

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Calculating the accuracy for this batch of test sentences
    total_eval_accuracy += flat_accuracy(logits, label_ids)
    preds, labs = get_preds(logits, label_ids)
    predicted_labels.extend(preds)
    actual_labels.extend(labs)

    results = analyze_preds(preds, labs)
    tp += results[0]
    fp += results[2]
    fn += results[3]
    tn += results[1]

  
  precision = tp / (tp + fp)
  recall = tp / (tp + fn)
  f_score = 2 * (precision * recall) / (precision + recall)
  print('Precision: ', precision)
  print('Recall: ', recall)
  print('F1: ', f_score)
  print("\n")
  #print('Confusion Matrix:') # binary
  #conf_matrix = confusion_matrix(actual_labels, predicted_labels) # binary
  #print(conf_matrix) # binary
  #print('TN: {} | TP: {} | FN: {} | FP: {}'.format(tn, tp, fn, fp)) # binary
  avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
  print(classification_report(actual_labels, predicted_labels))
 # print('Sklearn Precision Micro ', precision_score(actual_labels, predicted_labels, average='micro'))
 # print('Sklearn Precision Macro ', precision_score(actual_labels, predicted_labels, average='macro'))
 # print('Sklearn Recall Micro ', recall_score(actual_labels, predicted_labels, average='micro'))
 # print('Sklearn Recall Macro ', recall_score(actual_labels, predicted_labels, average='macro'))
  #print('Sklrean f1 score (Macro): ', f1_score(actual_labels, predicted_labels, average='macro'))

  print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
  print("\n")
  if final:
    print('Final epoch has been reached, appending results...')
    print("\n")
    final_prec.append(precision)
    final_rec.append(recall)
    final_f1.append(f_score)
    final_conf_matrix = conf_matrix
  # Calculate the average loss over all of the batches.
  avg_val_loss = total_eval_loss / len(validation_dataloader)
  
  # Measure how long the validation run took.
  validation_time = format_time(time.time() - t0)
  
  print("  Validation Loss: {0:.2f}".format(avg_val_loss))
  print("  Validation took: {:}".format(validation_time))
  print("\n")
  # Record all statistics from this epoch.
  training_stats.append(
      {
          'epoch': epoch_i + 1,
          'Training Loss': avg_train_loss,
          'Valid. Loss': avg_val_loss,
          'Valid. Accur.': avg_val_accuracy,
          'Training Time': training_time,
          'Validation Time': validation_time
      }
  )
  
#print("")
#print("Training complete!")
#print("\n")
#print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
#print("\n")
#print('Precision: ', final_prec)
#print('Recall: ', final_rec)
#print('F1: ', final_f1)



Training...
  Batch    40  of    179.    Elapsed: 0:00:21.
  Batch    80  of    179.    Elapsed: 0:00:42.
  Batch   120  of    179.    Elapsed: 0:01:03.
  Batch   160  of    179.    Elapsed: 0:01:25.
Confusion Matrix: 
[[4400  164    0]
 [ 446  292    0]
 [ 330   71    3]]




Precision:  [0.85007728 0.5540797  1.        ]
Recall:  [0.96406661 0.39566396 0.00742574]
F1:  [0.90349076 0.46166008 0.01474201]


  Average training loss: 0.48
  Training epcoh took: 0:01:35


Running Validation


Precision:  0.8787878787878788
Recall:  0.26605504587155965
F1:  0.4084507042253521


              precision    recall  f1-score   support

           0       0.84      0.99      0.91       439
           1       0.88      0.36      0.51        77
           2       0.00      0.00      0.00        33

    accuracy                           0.85       549
   macro avg       0.57      0.45      0.48       549
weighted avg       0.80      0.85      0.80       549

  Accuracy: 0.84


  Validation Loss:

Running on the test set


In [19]:
# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

# Put model in evaluation mode
model.eval()

# Tracking variables 
predicted_labels , true_labels = [], []

# Predict 
for batch in test_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask,return_dict=False)

  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  preds, labs = get_preds(logits, label_ids)
  
  # Store predictions and true labels
  predicted_labels.extend(preds)
  true_labels.extend(labs)

print(classification_report(true_labels, predicted_labels))
#print('Sklearn Precision Micro ', precision_score(true_labels, predicted_labels, average='micro'))
#print('Precision Macro ', precision_score(true_labels, predicted_labels, average='macro'))
#print('Sklearn Recall Micro ', recall_score(true_labels, predicted_labels, average='micro'))
#print('Sklearn Recall Macro ', recall_score(true_labels, predicted_labels, average='macro'))
#print('Sklrean f1 score (Macro): ', f1_score(true_labels, predicted_labels, average='macro'))

print('    DONE.')

Predicting labels for 17 test sentences...
              precision    recall  f1-score   support

           0       0.82      0.92      0.87      1264
           1       0.74      0.63      0.68       412
           2       0.38      0.23      0.28       225

    accuracy                           0.78      1901
   macro avg       0.65      0.59      0.61      1901
weighted avg       0.75      0.78      0.76      1901

    DONE.


In [20]:
# save model 

#model_save_name = 'te+red_extraf:red-mnt_5-classifier.pt'
#path = F"/content/drive/My Drive/Colab Notebooks/models/{model_save_name}"  
#torch.save(model.state_dict(), path)

In [21]:
predictions = list(zip(test, predicted_labels, true_labels))
print(predictions[0:3])


[('id', 1, 1), ('text', 0, 0), ('user', 1, 1)]
