Author: Thijs Brekhof

# Installing packages and specifying gpu

In [1]:
from collections import Counter

In [2]:
import tensorflow as tf
# Verifying GPU availability (you have to turn it on in google colab)
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [3]:
# Installing necessary packages
!pip install pytorch-pretrained-bert pytorch-nlp

# BERT imports
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
# specify GPU device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla T4'

# Loading dataset and applying preprocessing

In [80]:
# Loading csv file as df and removing irrelevant / empty (= multiple frames) frames
train_df = pd.read_csv('Train.csv')
validation_df = pd.read_csv('Validation.csv')

# Testing only 5 most frequent occuring topics
# train_df = train_df.loc[train_df['topic'].isin(['Other','Legality, constitutionality and jurisprudence', 'Fairness and equality', 'Quality of Life', 'Cultural identity'])]
# validation_df = validation_df.loc[validation_df['topic'].isin(['Other','Legality, constitutionality and jurisprudence', 'Fairness and equality', 'Quality of Life', 'Cultural identity'])]
# validation_df = validation_df.drop(validation_df[validation_df['topic'] == 'Other'].sample(frac=0.5).index)


In [81]:
print(train_df.shape)
print(validation_df.shape)

(1691, 12)
(235, 12)


In [82]:
# Creating list of comments and corresponding labels
comments_train = train_df.comment_text.values
comments_val = validation_df.comment_text.values
# adding begin and end of sequence tokens that BERT recognises
comments_train = ["[CLS] " + comment + " [SEP]" for comment in comments_train]
comments_val = ["[CLS] " + comment + " [SEP]" for comment in comments_val]

# Getting labels           
labels_train = train_df.topic.values
labels_val = validation_df.topic.values
print(Counter(labels_val))
# Getting unique labels 
un_labels = set()
for label in labels_train:
  un_labels.add(label)
print(len(un_labels), un_labels)
# Rewriting labels from str to int so it can be 'understood' by the classifier 
le = preprocessing.LabelEncoder()
le.fit(sorted(un_labels))

labels_train = le.transform(labels_train)
labels_val = le.transform(labels_val)

Counter({'Other': 56, 'Cultural identity': 32, 'Legality, constitutionality and jurisprudence': 26, 'Fairness and equality': 17, 'Historical': 17, 'Morality': 17, 'Quality of Life': 15, 'Economic': 12, 'Capacity and resources': 10, 'Policy prescription and evaluation': 9, 'Crime and punishment': 5, 'Climate and environment': 4, 'Technology and innovation': 4, 'Public opinion': 3, 'Health and Safety': 3, 'Education': 2, 'International relations and reputation': 1, 'Security and defense': 1, 'Political': 1})
19 {'Fairness and equality', 'Historical', 'Legality, constitutionality and jurisprudence', 'Technology and innovation', 'Cultural identity', 'Health and Safety', 'International relations and reputation', 'Security and defense', 'Capacity and resources', 'Other', 'Political', 'Public opinion', 'Economic', 'Policy prescription and evaluation', 'Education', 'Climate and environment', 'Quality of Life', 'Crime and punishment', 'Morality'}


In [83]:
# Getting BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tokenized_texts_train = [tokenizer.tokenize(comment) for comment in comments_train]
tokenized_texts_val = [tokenizer.tokenize(comment) for comment in comments_val]
# Example of the first comment tokenized
# print ("Tokenize the first comment:")
# print (tokenized_texts[0])

In [84]:
# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids_train = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts_train]
input_ids_val = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts_val]

In [85]:
# Pad our input tokens
input_ids_train = pad_sequences(input_ids_train, maxlen=305, dtype="long", truncating="post", padding="post")
input_ids_val = pad_sequences(input_ids_val, maxlen=305, dtype="long", truncating="post", padding="post")

In [86]:
# Create attention masks
attention_masks_train = []
attention_masks_val = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids_train:
  seq_mask = [float(i>0) for i in seq]
  attention_masks_train.append(seq_mask)

for seq in input_ids_val:
  seq_mask = [float(i>0) for i in seq]
  attention_masks_val.append(seq_mask)

In [87]:
# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(input_ids_train)
validation_inputs = torch.tensor(input_ids_val)
train_labels = torch.tensor(labels_train)
validation_labels = torch.tensor(labels_val)
train_masks = torch.tensor(attention_masks_train)
validation_masks = torch.tensor(attention_masks_val)

In [88]:
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 16

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# Making model ready for training

In [89]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=19)
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

In [90]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [91]:
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=1e-5,
                     warmup=.1)

t_total value of -1 results in schedule not being applied


In [92]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

# Training the model

In [None]:
t = [] 

# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 5

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  
  
  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()
  
  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    train_loss_set.append(loss.item())    
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    
    
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
    
  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Tracking variables 
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Train loss: 2.465976764570992


Epoch:  20%|██        | 1/5 [02:01<08:05, 121.29s/it]

Validation Accuracy: 0.3098484848484848


In [None]:
plt.figure(figsize=(15,8))
plt.title("Training loss")
plt.xlabel("Batch")
plt.ylabel("Loss")
plt.plot(train_loss_set)
plt.show()

# Testing the model on test set

In [None]:
# Importing and processing test data
test_df = pd.read_csv('Test.csv')

# For 5 most frequent, uncomment next line
# test_df = test_df.loc[test_df['topic'].isin(['Other','Legality, constitutionality and jurisprudence', 'Fairness and equality', 'Quality of Life', 'Cultural identity'])]

print(test_df.shape)
comments_test = test_df.comment_text.values
comments_test = ["[CLS] " + comment + " [SEP]" for comment in comments_test]
labels = test_df.topic.values
print(Counter(labels))

labels = le.transform(labels)
tokenized_texts = [tokenizer.tokenize(sent) for sent in comments_test]

# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=305, dtype="long", truncating="post", padding="post")
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask) 

prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)

batch_size = 16

prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [None]:
# Prediction on test set

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  # Telling the model not to compute or store gradients, saving memory and speeding up prediction
  with torch.no_grad():
    # Forward pass, calculate logit predictions
    logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

In [None]:
# Import and evaluate each test batch using Matthew's correlation coefficient
from sklearn.metrics import matthews_corrcoef
matthews_set = []

for i in range(len(true_labels)):
  matthews = matthews_corrcoef(true_labels[i],
                 np.argmax(predictions[i], axis=1).flatten())
  matthews_set.append(matthews)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# Flatten the predictions and true values for aggregate Matthew's evaluation on the whole dataset
flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels = [item for sublist in true_labels for item in sublist]

print(flat_predictions)

print("MCC: {0}".format(matthews_corrcoef(flat_true_labels, flat_predictions)))
print("Accuracy: {0}".format(accuracy_score(flat_true_labels, flat_predictions)))
scores = precision_recall_fscore_support(flat_true_labels, flat_predictions, average="macro")
print("Precision: {0}\nRecall: {1}\nF1: {2}".format(scores[0], scores[1], scores[2]))

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
from matplotlib import pyplot


matrix_labels = []
count = 0
for label in sorted(un_labels):
  matrix_labels.append("{0} ({1})".format(label, str(count)))
  count += 1
  
pyplot.figure(figsize=(13, 11), dpi=600)
cf_matrix = confusion_matrix(flat_true_labels, flat_predictions)
sns.heatmap(cf_matrix, annot=True, yticklabels=matrix_labels)


In [None]:
# Baseline - dummy classifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import classification_report

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(train_inputs, train_labels)

prediction = dummy_clf.predict(flat_predictions)
dummy_clf.score(prediction_inputs, labels)
print(classification_report(labels, prediction, digits=4))