## Install and Import

In [1]:
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [2]:
!pip install pytorch-pretrained-bert pytorch-nlp



In [3]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

In [4]:
#### use GPU as device for Torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'Tesla T4'

## Load Dataset


In [5]:
webapps_train = pd.read_csv("webapps_train.csv")
webapps_valid = pd.read_csv("webapps_val.csv")
webapps_test = pd.read_csv("webapps_test.csv")

df = webapps_train.append(webapps_valid).reset_index(drop=True)

In [6]:
df.shape

(30, 2)

In [7]:
df.sample(10)

Unnamed: 0,text,intent
13,How do I change my password on TV Tropes?,0
21,Twitter and Echofon spam,4
26,"Does using Gmail's ""Never send it to Spam"" fil...",4
17,How to disable/delete a Harvest account?,1
25,Are there any good Veoh or Youtube alternatives?,5
9,Alternatives to Twitter,5
20,Google Bookmarks and Chrome Bookmark Sync -- D...,7
11,How can I sync my Yahoo! Calendar with Google ...,7
1,Change subject line in new Gmail compose window,6
12,How do I delete my Ohloh profile?,1


## Preprocessing

In [8]:
# import the BERT tokenizer, used to convert our text into tokens that correspond to BERT's vocabulary.

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


In [9]:
def pre_process_df(df):
  sents = df.text.values
  sents = ["[CLS] " + s + " [SEP]" for s in sents]
  labls = df.intent.values

  tokenized_texts = [tokenizer.tokenize(s) for s in sents]

  MAX_LEN = 28

  # Tokenizing
  input_ids_ = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

  # Padding
  input_ids = []
  for ids in input_ids_:
        ids = ids[:min(len(ids), MAX_LEN - 2)]
        ids = ids + [0] * (MAX_LEN - len(ids))
        input_ids.append(np.array(ids))
  input_ids = np.array(input_ids)


  # Create attention masks
  attention_masks = []

  # Create a mask of 1s for each token followed by 0s for padding
  for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask) 

  return input_ids, attention_masks, labls


## Inputs

In [33]:
input_ids, attention_masks, labels = pre_process_df(df)

# Use train_test_split to split our data into train and validation sets for training

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=2021, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2021, test_size=0.1)

In [34]:
# Convert all of our data into torch tensors, the required datatype for DL

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [35]:
# Select a batch size for training. 
batch_size = 1

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


## Train Model

For this task, we have loaded [BertForSequenceClassification](https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L1129). This is the normal BERT model with an added single linear layer on top for classification that we will use as a sentence classifier.


### Structure of Model

The additional layer added on top of the BERT model consists of untrained linear neurons of size [hidden_state, number_of_labels], so [768,8], meaning that the output of BERT plus our classification layer is a vector of 8 numbers representing the "score" for our labels that are then fed into cross-entropy loss.



In [36]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=8)
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
   

Now that we have our model loaded we need to grab the training hyperparameters from within the stored model.

For the purposes of fine-tuning, the authors recommend the following hyperparameter ranges:
- Batch size: 16, 32
- Learning rate (Adam): 5e-5, 3e-5, 2e-5
- Number of epochs: 2, 3, 4

In [37]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [38]:
# This variable contains all of the hyperparemeter information our training loop needs
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=.1)

t_total value of -1 results in schedule not being applied


# Training Loop

In [39]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [40]:
t = [] 

# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 5

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  
  
  # Training
  
  # Set the model to training mode (as opposed to evaluation mode)
  model.train()
  
  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    train_loss_set.append(loss.item())    
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    
    
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
    
  # Validation

  # Put the model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Tracking variables 
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

Epoch:  20%|██        | 1/5 [00:03<00:15,  3.77s/it]

Train loss: 2.1210919088787503
Validation Accuracy: 0.0


Epoch:  40%|████      | 2/5 [00:07<00:11,  3.75s/it]

Train loss: 1.8543440986562658
Validation Accuracy: 0.0


Epoch:  60%|██████    | 3/5 [00:11<00:07,  3.75s/it]

Train loss: 1.4798331437287506
Validation Accuracy: 0.0


Epoch:  80%|████████  | 4/5 [00:14<00:03,  3.75s/it]

Train loss: 1.0562017394436731
Validation Accuracy: 0.3333333333333333


Epoch: 100%|██████████| 5/5 [00:18<00:00,  3.74s/it]

Train loss: 0.8370019096743178
Validation Accuracy: 0.3333333333333333





##Predict and Evaluate on Test Set

Load the test set and look at confusion matrix and check some scores

In [41]:
test_df = pd.read_csv("webapps_test.csv")

test_ids, test_att, test_labels = pre_process_df(test_df)

prediction_inputs = torch.tensor(test_ids)
prediction_masks = torch.tensor(test_att)
prediction_labels = torch.tensor(test_labels)
  
batch_size = 1  


prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [42]:
# Prediction on test set

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  # Telling the model not to compute or store gradients, saving memory and speeding up prediction
  with torch.no_grad():
    # Forward pass, calculate logit predictions
    logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

In [43]:
# Flatten the predictions and true values for aggregate evaluation on the whole dataset
flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels = [item for sublist in true_labels for item in sublist]


In [44]:
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

In [45]:
print(classification_report(flat_true_labels, flat_predictions))

              precision    recall  f1-score   support

           0       0.40      0.33      0.36         6
           1       0.53      1.00      0.69        10
           3       0.00      0.00      0.00         3
           4       0.67      1.00      0.80        14
           5       1.00      0.88      0.93        16
           6       0.00      0.00      0.00         4
           7       0.00      0.00      0.00         6

    accuracy                           0.68        59
   macro avg       0.37      0.46      0.40        59
weighted avg       0.56      0.68      0.60        59



  _warn_prf(average, modifier, msg_start, len(result))


In [46]:
print(confusion_matrix(flat_true_labels, flat_predictions))

[[ 2  2  0  2  0  0  0]
 [ 0 10  0  0  0  0  0]
 [ 0  2  0  1  0  0  0]
 [ 0  0  0 14  0  0  0]
 [ 0  1  0  1 14  0  0]
 [ 0  2  0  2  0  0  0]
 [ 3  2  0  1  0  0  0]]


In [47]:
print(accuracy_score(flat_true_labels, flat_predictions))

0.6779661016949152


In [48]:
print(f1_score(flat_true_labels, flat_predictions, average='weighted'))

0.5968085294794822
