# DistilBERT - Trained on the AG News Dataset
## Team: SHAP
This notebook sets up and trains a DistilBERT model.
A DistilBERT tokenizer is used to parse data from the AG News dataset
and is then converted into tensors to be fed to our model.

Testing is the next step to be implemented with this model.

## Prerequisites / Installations

In [1]:
!pip install -q datasets
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m71.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import torch

if torch.cuda.is_available():
  device = torch.device("cuda")
  print("GPU Available: ", torch.cuda.get_device_name(0))
else:
  device = torch.device("cpu")
  print("No GPU Available, Switching To CPU.")

GPU Available:  Tesla T4


.state_dict()## Load and check data.

In [3]:
from datasets import load_dataset

#Load train & test data
ag_news_train = load_dataset("ag_news", split='train')

#Use subset of original training set for faster training
NUM_SAMPLES = 10000
train_titles = [row['text'] for row in ag_news_train][:NUM_SAMPLES]
train_labels = [row['label'] for row in ag_news_train][:NUM_SAMPLES]


Downloading builder script:   0%|          | 0.00/4.06k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

Downloading and preparing dataset ag_news/default to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548...


Downloading data:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Dataset ag_news downloaded and prepared to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548. Subsequent calls will reuse this data.


## Set Up DistilBERT Tokenizer

In [4]:
from transformers import DistilBertTokenizer


#Setting up tokenizer for standard DistilBERT Model with 12 layers

tokenizer_params = "distilbert-base-uncased"

tokenizer = DistilBertTokenizer.from_pretrained(
    tokenizer_params, do_lower_case=True)


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

## Find Max Title Length (Post-Tokenization) in Dataset

In [5]:
"""
DistilBERT requires a fixed-length input. Therefore we need to pad or truncate 
each sentence in order to make it fit a certain size. Our first step is to 
figure out what is the largest title token-wise and set our fixed length.
"""

max_title_length = 0

#Iterate through each instance
for title in train_titles:
  #Encode our title
  input_ids = tokenizer.encode(title, add_special_tokens=True)
  title_length = len(input_ids)

  #Check if its length sets a new maximum
  max_len = max(max_title_length, title_length)

print("====Finding MSL For Fixed-Length Input====")
print("Maximum Title Length: ", max_len) 

====Finding MSL For Fixed-Length Input====
Maximum Title Length:  27


## Encode Data With Tokenizer




In [6]:
import torch

input_ids = []
attention_masks = []

margin_of_error = 10

for title in train_titles:
  """
  Process of tokenization:
  - Tokenize all of the news articles
  - Add [CLS] token to the beginning, signifies beginning
  - Add [SEP] token to the end, signifies end
  - Map tokens to their numerical IDs
  - Make the sentences a fixed length through padding or truncating
  - Create attention masks for [PAD] tokens 
  """
  encoded_dict = tokenizer.encode_plus(
      title, 
      add_special_tokens = True,
      max_length = max_title_length + margin_of_error,
      pad_to_max_length = True,
      return_attention_mask = True,
      return_tensors = 'pt',
  )

  #Store encoded title to our input IDs
  input_ids.append(encoded_dict['input_ids'])

  #Store attention mask
  attention_masks.append(encoded_dict['attention_mask'])

#Concatenate input_ids and attention_masks 
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

#Convert training labels to tensor
labels = torch.tensor(train_labels)



Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


## Split Data and Configure Dataloaders





In [7]:
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

#Form a dataset from the previous three components
dataset = TensorDataset(input_ids, attention_masks, labels)

"""
Our actual training dataset will incorporate ninety percent of our original 
training data, while the validation dataset will consist of the 
remaining ten percent. This operation will be performed using a random
split.
"""
BATCH_SIZE = 16

train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
training_set, validation_set = random_split(dataset, [train_size, val_size])

print("====Training & Validation Split====")
print("Training Size: {}".format(train_size))
print("Validation Size: {}".format(val_size))


train_dataloader = DataLoader(
    training_set,
    sampler = RandomSampler(training_set),
    batch_size = BATCH_SIZE
)

validation_dataloader = DataLoader(
    validation_set,
    sampler = SequentialSampler(validation_set),
    batch_size = BATCH_SIZE
)

====Training & Validation Split====
Training Size: 9000
Validation Size: 1000


## Initialise Model Before Training

In [8]:
from transformers import DistilBertForSequenceClassification, AdamW, DistilBertConfig

"""
Labels:
1 - World
2 - Sports 
3 - Business 
4 - Sci/Tech
"""
NUM_LABELS = 4

# DistilBERT (pre-trained, standard 12 layers)
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=NUM_LABELS,
    output_attentions = False,
    output_hidden_states = False,
).cuda()





Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classi

## Setup Optimiser and Scheduler 

In [9]:
from transformers import get_linear_schedule_with_warmup

EPOCHS = 2
TOTAL_STEPS = len(train_dataloader) * EPOCHS

optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = TOTAL_STEPS)




## Helper Functions

In [10]:
import numpy as np 
import time
import datetime

#Calculate the accuracy of predictions vs label
def flat_accuracy(preds, labels):
  pred_flat = np.argmax(preds, axis=1).flatten()
  labels_flat = labels.flatten()
  return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
  #Round to nearest second
  elapsed_rounded = int(round((elapsed)))

  #Format as hh:mm:ss
  return str(datetime.timedelta(seconds=elapsed_rounded))

#Preparation for saving our model

In [11]:
from google.colab import drive
drive.mount('/content/gdrive')
model_save_name = 'classifier.pt'
PATH = F"/content/gdrive/MyDrive/0db/{model_save_name}" 

Mounted at /content/gdrive


# Fine Tune Model - Main training loop




In [12]:
import random
import numpy as np

def should_print_batch_update(current_step, base_time):
  return (step % 40 == 0 and not step == 0)
    
def print_batch_update(current_step, base_time):
  elapsed = format_time(time.time() - base_time)
  print("Batch {:>5} of {:>5}. Elapsed: {:}.".format(step, 
      len(train_dataloader), elapsed))

"""
We will set a seed for the random functions in order
to allow for reproducible results.
"""
seed_val = 100
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

training_stats = []

initial_time = time.time()


for epoch_i in range(0, EPOCHS):
  print("\n\n==== Epoch {:} / {:} ====".format(epoch_i + 1, EPOCHS))

  base_time = time.time()
  total_train_loss = 0

  #Set the model to train mode
  #Note that this does not train the model itself
  model.train()

  #Iterate through our training data
  for step, batch in enumerate(train_dataloader):

    #Progress update every X steps
    if should_print_batch_update(step, base_time):
      print_batch_update(step, base_time)
    
    """
    We would like to retrieve each component of our training batch.
    Each batch contains:
    - Input IDs
    - An Attention Mask
    - Labels
    """
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    model.zero_grad()

    forward_pass = model(input_ids=b_input_ids,
                         attention_mask=b_input_mask,
                         labels=b_labels)
    
    loss = forward_pass.loss
    logits = forward_pass.logits

    #print("Train loss: {}".format(loss.item()))
    total_train_loss += loss.item()

    #Perform a backward pass to calculate the gradients
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    #Update parameters and take a step using the computed gradient
    optimizer.step()

    #Update the learning rate
    scheduler.step()
    
    # we could save model state here - in the form of checkpoints - if needed          
    
  #Calculate average loss over all of the batches
  avg_training_loss = total_train_loss / len(train_dataloader)

  #Measure how long this epoch took
  training_time = format_time(time.time() - base_time)

  print("")
  print("Average training loss: {0:.2f}".format(avg_training_loss))
  print("Training epoch took: {:}".format(training_time))

  base_time = time.time()

  #Set the model to evaluation mode.
  model.eval()

  #Initialise our evaluation variables.
  total_eval_accuracy = 0
  total_eval_loss = 0
  nb_eval_steps = 0

  for batch in validation_dataloader:
    """
    We would like to retrieve each component of our validation batch.
    Each batch contains:
    - Input IDs
    - An Attention Mask
    - Labels
    """
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    with torch.no_grad():
      forward_pass = model(input_ids = b_input_ids,
                           attention_mask = b_input_mask,
                           labels = b_labels)
    
    loss = forward_pass.loss
    logits = forward_pass.logits
      
    #Accumulate the validation loss
    total_eval_loss += loss.item()

    #Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    total_eval_accuracy += flat_accuracy(logits, label_ids)

  #Report final accuracy for this validation run
  avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
  print("Accuracy: {0:2f}".format(avg_val_accuracy))

  #Calculate the average loss over all of the batches
  avg_val_loss = total_eval_loss / len(validation_dataloader)

  #Measure how long the validation run took
  validation_time = format_time(time.time() - base_time)

  print("Validation Loss: {0:.2f}".format(avg_val_loss))
  print("Validation Took: {:}".format(validation_time))

  #Store our training statistics
  training_stats.append(
      {
        'epoch': epoch_i + 1,
        'Training Loss': avg_training_loss,
        'Valid. Loss': avg_val_loss,
        'Valid. Accur.': avg_val_accuracy,
        'Training Time': training_time,
        'Validation Time': validation_time
      }
  )

  print("\nTraining complete!")
  print("Total training took {:} (h:mm:ss)".format(format_time(
      time.time()-initial_time)))
  
  #Save trained model  
  torch.save(model.state_dict(), PATH)



==== Epoch 1 / 2 ====
Batch    40 of   563. Elapsed: 0:00:04.
Batch    80 of   563. Elapsed: 0:00:06.
Batch   120 of   563. Elapsed: 0:00:08.
Batch   160 of   563. Elapsed: 0:00:10.
Batch   200 of   563. Elapsed: 0:00:11.
Batch   240 of   563. Elapsed: 0:00:13.
Batch   280 of   563. Elapsed: 0:00:15.
Batch   320 of   563. Elapsed: 0:00:17.
Batch   360 of   563. Elapsed: 0:00:18.
Batch   400 of   563. Elapsed: 0:00:20.
Batch   440 of   563. Elapsed: 0:00:22.
Batch   480 of   563. Elapsed: 0:00:24.
Batch   520 of   563. Elapsed: 0:00:25.
Batch   560 of   563. Elapsed: 0:00:27.

Average training loss: 0.66
Training epoch took: 0:00:27
Accuracy: 0.841270
Validation Loss: 0.47
Validation Took: 0:00:00

Training complete!
Total training took 0:00:28 (h:mm:ss)


==== Epoch 2 / 2 ====
Batch    40 of   563. Elapsed: 0:00:02.
Batch    80 of   563. Elapsed: 0:00:04.
Batch   120 of   563. Elapsed: 0:00:05.
Batch   160 of   563. Elapsed: 0:00:07.
Batch   200 of   563. Elapsed: 0:00:09.
Batch   24