# Data Preparation

## 1.1 Using Colab GPU for Training

In [None]:
import os
# Set CUDA_LAUNCH_BLOCKING before any CUDA library is imported in later cells.
# NOTE(review): presumably enabled to get synchronous, easier-to-read CUDA error
# traces while debugging; it is expected to slow training down -- confirm that
# it is still needed.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [None]:
import tensorflow as tf

# Ask TensorFlow which GPU device (if any) is visible to this runtime.
device_name = tf.test.gpu_device_name()

# Stop right away unless the runtime reports the expected first GPU device.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')

print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
import torch

# Pick the device PyTorch will run on: the GPU when one is visible,
# otherwise the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

if use_gpu:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla P4


## 1.2 Installing the Hugging Face Library

In [None]:
!pip install transformers



## 1.3 Loading Data

In [None]:
import pandas as pd
import numpy as np

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd /content/drive/My Drive/GeSumGenEval

/content/drive/.shortcut-targets-by-id/1PcWXs_So5sTaP0wBAR77_ORSfd4aFtHq/GeSumGenEval


In [None]:
import json
from functools import reduce

# Quality dimensions rated by every annotator.
DIMENSIONS = ('coherence', 'consistency', 'fluency', 'relevance')


def mean_annotation_scores(annotations):
    """Average each quality dimension over a list of annotation dicts.

    Args:
        annotations: non-empty list of dicts, each holding a numeric rating
            under every key in ``DIMENSIONS``.

    Returns:
        dict mapping each dimension to ``mean(rating) - 1``. The mean is taken
        over however many annotations are present rather than assuming a fixed
        annotator count; the ``- 1`` shift is what lets the later ``np.floor``
        step produce class indices that start at 0.

    Raises:
        ZeroDivisionError: if ``annotations`` is empty.
    """
    n_annotations = len(annotations)
    return {
        dim: sum(anno[dim] for anno in annotations) / n_annotations - 1
        for dim in DIMENSIONS
    }


summaries = []
crowd_scores = {dim: [] for dim in DIMENSIONS}
expert_scores = {dim: [] for dim in DIMENSIONS}

# Read explicitly as UTF-8 so the German text does not depend on the platform's
# default encoding. The `with` block closes the file on exit.
with open("data/transl_20210228a.json", "r", encoding="utf-8") as json_file:
    in_json = json.load(json_file)

for json_data in in_json:
    expert_mean = mean_annotation_scores(json_data["expert_annotations"])
    crowd_mean = mean_annotation_scores(json_data["turker_annotations"])

    for dim in DIMENSIONS:
        expert_scores[dim].append(expert_mean[dim])
        crowd_scores[dim].append(crowd_mean[dim])

    # The machine-translated (EN -> DE) summary is the model input.
    summaries.append(json_data['summ_marian_en_de'])

# Sanity checks on the expected dataset size.
assert len(summaries) == 1600
assert len(expert_scores['coherence']) == 1600
assert len(crowd_scores['coherence']) == 1600

In [None]:
# Class-imbalance check: how many of the expert fluency scores sit at the
# top value of 4?
fluency_scores = expert_scores['fluency']
j2 = list(filter(lambda score: score == 4, fluency_scores))

print(len(fluency_scores))
print(len(j2))

1600
1150


In [None]:
# Model inputs: one translated summary per dataset entry.
sentences = np.array(summaries)

# Preview a few entries only; echoing the whole array floods the notebook output.
sentences[:3]

array(['Paul Merson wurde mit nur sieben Minuten in seinem Team bleiben 0:0 Unentschieden mit Burnley gebracht. In der 89. Minute erzielte andros Townsend den Tottenham-Mittelfeldspieler. paul merson hatte eine weitere Grabung bei andros townsend nach seinem Aussehen. Der Mittelfeldspieler wurde letzte Woche in die England-Mannschaft gebracht. Klicken Sie hier für alle aktuellen Arsenal News.',
       "paul merson hat seine Reihe mit andros townsend neu gestartet. Der Tottenham-Mittelfeldspieler wurde mit nur sieben Minuten verbleiben in seinem Team 0-0 Unentschieden mit Burnley gebracht. andros townsend punktet england's Ausgleich in ihrer 1-1 freundliche Ziehung mit italien in turin.",
       'paul merson hat seine Reihe mit andros townsend neu gestartet, nachdem die tottenham Mittelfeldspieler wurde auf mit nur sieben Minuten verbleiben in seinem Team 0-0 Draw mit Burnley am Sonntag gebracht. Die Stadt wurde in der 83. Minute für Tottenham, wie sie zog 0-0 gegen Burnley gebracht. to

In [None]:
# Choose which annotation set and quality dimension to train a classifier for.
# Change only these two values to switch tasks.
ANNOTATOR = 'crowd'       # 'expert' or 'crowd'
DIMENSION = 'relevance'   # 'coherence', 'consistency', 'fluency' or 'relevance'

# Number of fine-tuning epochs used for each (annotator, dimension) task.
EPOCHS_BY_TASK = {
    ('expert', 'coherence'): 4,
    ('expert', 'consistency'): 3,
    ('expert', 'fluency'): 4,
    ('expert', 'relevance'): 3,
    ('crowd', 'coherence'): 3,
    ('crowd', 'consistency'): 2,
    ('crowd', 'fluency'): 4,
    ('crowd', 'relevance'): 4,
}

_scores_by_annotator = {'expert': expert_scores, 'crowd': crowd_scores}

# Floor the averaged scores to turn them into integer class labels.
labels = np.floor(_scores_by_annotator[ANNOTATOR][DIMENSION]).astype(int)
model_save_path = 'models/cnndm/{}_{}.pt'.format(ANNOTATOR, DIMENSION)
epochs = EPOCHS_BY_TASK[(ANNOTATOR, DIMENSION)]

# Tokenization & Input Formatting

## 2.1 BERT Tokenizer

In [None]:
from transformers import BertTokenizer

# Load the cased German BERT tokenizer. The same checkpoint name is used when
# the classification model is loaded later, so the two vocabularies match.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-german-dbmdz-cased')

Loading BERT tokenizer...


In [None]:
# Show the first summary three ways: raw text, WordPiece tokens, and the
# vocabulary IDs of those tokens.
first_sentence = sentences[0]
first_tokens = tokenizer.tokenize(first_sentence)

print(' Original: ', first_sentence)
print('Tokenized: ', first_tokens)
print('Token IDs: ', tokenizer.convert_tokens_to_ids(first_tokens))

 Original:  Paul Merson wurde mit nur sieben Minuten in seinem Team bleiben 0:0 Unentschieden mit Burnley gebracht. In der 89. Minute erzielte andros Townsend den Tottenham-Mittelfeldspieler. paul merson hatte eine weitere Grabung bei andros townsend nach seinem Aussehen. Der Mittelfeldspieler wurde letzte Woche in die England-Mannschaft gebracht. Klicken Sie hier für alle aktuellen Arsenal News.
Tokenized:  ['Paul', 'Mer', '##son', 'wurde', 'mit', 'nur', 'sieben', 'Minuten', 'in', 'seinem', 'Team', 'bleiben', '0', ':', '0', 'Unentschieden', 'mit', 'Bur', '##nl', '##ey', 'gebracht', '.', 'In', 'der', '89', '.', 'Minute', 'erzielte', 'and', '##ros', 'Town', '##sen', '##d', 'den', 'Tot', '##ten', '##ham', '-', 'Mittelfeld', '##spieler', '.', 'pa', '##ul', 'mer', '##son', 'hatte', 'eine', 'weitere', 'Grab', '##ung', 'bei', 'and', '##ros', 'to', '##wn', '##sen', '##d', 'nach', 'seinem', 'Aussehen', '.', 'Der', 'Mittelfeld', '##spieler', 'wurde', 'letzte', 'Woche', 'in', 'die', 'England', '

## 2.2 Sentences to IDs

The `tokenizer.encode` function combines multiple steps for us:
1. Split the sentence into tokens.
2. Add the special `[CLS]` and `[SEP]` tokens.
3. Map the tokens to their IDs.

Oddly, this function can perform truncating for us, but doesn't handle padding. 

In [None]:
# Encode every summary as a list of vocabulary IDs. For each sentence `encode`:
#   (1) tokenizes the text,
#   (2) prepends the `[CLS]` token,
#   (3) appends the `[SEP]` token,
#   (4) maps the tokens to their IDs.
# Truncation (`max_length`) and tensor conversion (`return_tensors`) are left
# off here because padding is applied in a separate step afterwards.
input_ids = [
    tokenizer.encode(sent, add_special_tokens=True)
    for sent in sentences
]

# Show sentence 0 next to its encoded form.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

Original:  Paul Merson wurde mit nur sieben Minuten in seinem Team bleiben 0:0 Unentschieden mit Burnley gebracht. In der 89. Minute erzielte andros Townsend den Tottenham-Mittelfeldspieler. paul merson hatte eine weitere Grabung bei andros townsend nach seinem Aussehen. Der Mittelfeldspieler wurde letzte Woche in die England-Mannschaft gebracht. Klicken Sie hier für alle aktuellen Arsenal News.
Token IDs: [102, 3728, 2776, 2006, 325, 212, 475, 3178, 2633, 153, 1182, 2117, 2839, 430, 853, 430, 23967, 212, 2054, 9611, 4508, 3981, 566, 259, 125, 12396, 566, 5921, 7035, 1257, 4707, 12174, 233, 30888, 190, 11891, 155, 4354, 232, 12532, 3680, 566, 26668, 320, 8933, 2006, 638, 261, 1633, 6648, 132, 282, 1257, 4707, 1512, 5308, 233, 30888, 333, 1182, 17856, 566, 351, 12532, 3680, 325, 3577, 2813, 153, 128, 5714, 232, 3204, 3981, 566, 15354, 286, 651, 231, 816, 6127, 26526, 9156, 566, 103]


## 2.3 Padding & Truncating

Pad and truncate our sequences so that they all have the same length, MAX_LEN.

First, what's the maximum sentence length in our dataset?

In [None]:
# Length (in token IDs) of the longest encoded sentence.
longest_sequence = max(map(len, input_ids))
print('Max sentence length: ', longest_sequence)

Max sentence length:  183


Given that, let's choose MAX_LEN = 200 and apply the padding.

In [None]:
# We'll borrow the `pad_sequences` utility function to do this.
from keras.preprocessing.sequence import pad_sequences

# Set the maximum sequence length.
# 200 is chosen somewhat arbitrarily; it is slightly larger than the longest
# encoded sentence reported by the previous cell (183 tokens).
MAX_LEN = 200

print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)

print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

# Pad our input tokens with value 0 (the attention-mask step relies on padding
# being ID 0).
# "post" indicates that we want to pad and truncate at the end of the sequence,
# as opposed to the beginning.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", 
                          value=0, truncating="post", padding="post")

print('\nDone.')


Padding/truncating all sentences to 200 values...

Padding token: "[PAD]", ID: 0

Done.


## 2.4 Attention Masks

The attention mask simply makes it explicit which tokens are actual words versus which are padding.

In this tokenizer's vocabulary, ID 0 is reserved for the `[PAD]` token (see the output above), so if a token ID is 0, then it's padding, and otherwise it's a real token.

In [None]:
# Build one attention mask per padded sequence:
#   - 0 where the token ID is 0 (padding),
#   - 1 where the token ID is > 0 (a real token).
attention_masks = [
    [int(token_id > 0) for token_id in sent]
    for sent in input_ids
]

## 2.5 Training & Validation Split

Divide up our dataset to use 80% for training and 20% for testing.

In [None]:
# Split the data into a training part (80%) and a held-out part (20%).
from sklearn.model_selection import train_test_split

# Split inputs, masks and labels in ONE call so the three stay row-aligned by
# construction, instead of relying on two separate calls sharing the same seed.
# With the same `random_state` and sample count the resulting partition is the
# same as before.
(train_inputs, test_inputs,
 train_masks, test_masks,
 train_labels, test_labels) = train_test_split(
    input_ids, attention_masks, labels,
    random_state=2018, test_size=0.2)

## 2.6 Converting to PyTorch Data Types

Our model expects PyTorch tensors rather than numpy.ndarrays, so convert all of our dataset variables.

In [None]:
# The model consumes torch tensors, so convert each train/test pair of
# inputs, labels and masks.
train_inputs, test_inputs = torch.tensor(train_inputs), torch.tensor(test_inputs)
train_labels, test_labels = torch.tensor(train_labels), torch.tensor(test_labels)
train_masks, test_masks = torch.tensor(train_masks), torch.tensor(test_masks)

We'll also create an iterator for our dataset using the torch DataLoader class. This helps save on memory during training because, unlike a for loop, with an iterator the entire dataset does not need to be loaded into memory.

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Batch size shared by both loaders. For fine-tuning BERT on a specific task,
# the authors recommend 16 or 32.
batch_size = 16

# Bundle (input ids, attention mask, label) triples for each split.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
test_data = TensorDataset(test_inputs, test_masks, test_labels)

# Training batches are drawn in random order; held-out batches in fixed order.
train_sampler = RandomSampler(train_data)
test_sampler = SequentialSampler(test_data)

train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, sampler=test_sampler)

# Train Our Classification Model

## 3.1 BertForSequenceClassification

For this task, we first want to modify the pre-trained BERT model to give outputs for classification, and then we want to continue training the model on our dataset until the entire model, end-to-end, is well-suited for our task. 

Thankfully, the huggingface pytorch implementation includes a set of interfaces designed for a variety of NLP tasks. Though these interfaces are all built on top of a trained BERT model, each has different top layers and output types designed to accommodate their specific NLP task.  

Here is the current list of classes provided for fine-tuning:
* BertModel
* BertForPreTraining
* BertForMaskedLM
* BertForNextSentencePrediction
* **BertForSequenceClassification** - The one we'll use.
* BertForTokenClassification
* BertForQuestionAnswering

The documentation for these can be found under [here](https://huggingface.co/transformers/v2.2.0/model_doc/bert.html).

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification: the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-german-dbmdz-cased', # 12-layer cased German BERT; same checkpoint as the tokenizer.
    num_labels = 5, # Five output classes for the multiclass score labels.
    output_attentions = False, # Do not return attention weights.
    output_hidden_states = False, # Do not return all hidden states.
)

# Move the model to the device selected at the top of the notebook. Unlike
# `model.cuda()`, this also works on the CPU fallback path. `.to()` returns the
# model, so the cell still displays its architecture.
model.to(device)

Some weights of the model checkpoint at bert-base-german-dbmdz-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31102, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## 3.2 Optimizer & Learning Rate Scheduler

Now that we have our model loaded we need to grab the training hyperparameters from within the stored model.

For the purposes of fine-tuning, the authors recommend choosing from the following values:
- Batch size: 16, 32  (We chose 16 when creating our DataLoaders).
- Learning rate (Adam): 5e-5, 3e-5, 2e-5  (We'll use 2e-5).
- Number of epochs: 2, 3, 4  (We'll use 4).

The epsilon parameter `eps = 1e-8` is "a very small number to prevent any division by zero in the implementation" (from [here](https://machinelearningmastery.com/adam-optimization-algorithm-for-deep-learning/)).

You can find the creation of the AdamW optimizer in `run_glue.py` [here](https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L109).

In [None]:
# Note: this AdamW is the class imported from the huggingface library (as
# opposed to pytorch). The 'W' presumably stands for the "weight decay fix".
# NOTE(review): the huggingface AdamW has been deprecated in favour of
# `torch.optim.AdamW` -- confirm against the installed transformers version
# before migrating (the two have different weight-decay defaults).
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

In [None]:
from transformers import get_linear_schedule_with_warmup

# The number of training epochs (`epochs`) is set together with the labels and
# the model save path in the task-selection cell; the authors recommend
# between 2 and 4.

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler. `scheduler.step()` is called once per
# batch in the training loop, which is why the total is counted in batches.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

## 3.3 Training Loop

Below is our training loop. There's a lot going on, but fundamentally for each pass in our loop we have a training phase and a validation phase. At each pass we need to:

Training loop:
- Unpack our data inputs and labels
- Load data onto the GPU for acceleration
- Clear out the gradients calculated in the previous pass. 
    - In pytorch the gradients accumulate by default (useful for things like RNNs) unless you explicitly clear them out.
- Forward pass (feed input data through the network)
- Backward pass (backpropagation)
- Tell the network to update parameters with optimizer.step()
- Track variables for monitoring progress

Evaluation loop:
- Unpack our data inputs and labels
- Load data onto the GPU for acceleration
- Forward pass (feed input data through the network)
- Compute loss on our validation data and track variables for monitoring progress

So please read carefully through the comments to get an understanding of what's happening. If you're unfamiliar with pytorch a quick look at some of their [beginner tutorials](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#sphx-glr-beginner-blitz-cifar10-tutorial-py) will help show you that training loops really involve only a few simple steps; the rest is usually just decoration and logging.  

Define a helper function for calculating accuracy.

In [None]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows in `preds` whose arg-max matches `labels`.

    `preds` holds one row of per-class scores per example; `labels` is an
    array of the corresponding true class indices (any shape, flattened here).
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.ravel()
    n_matches = (predicted_classes == true_classes).sum()
    return n_matches / true_classes.size

Helper function for formatting elapsed times.

In [None]:
import time
import datetime

def format_time(elapsed):
    '''
    Format a duration given in seconds as an h:mm:ss string.

    The value is rounded to the nearest whole second first.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


We're ready for training.

In [None]:
import random

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

# Seed every random number generator in use to make runs reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Average loss after each epoch, kept so the learning curve can be plotted.
# `eval_loss_values` stays empty: this loop runs no per-epoch validation pass;
# the held-out split is evaluated once, after training, in section 4.2.
train_loss_values = []
eval_loss_values = []

# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    # Put the model into training mode. The call to `train` only changes the
    # *mode* (layers such as dropout behave differently during training vs.
    # test); it does not perform any training itself.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Unpack the batch and copy each tensor to the selected device.
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step; PyTorch
        # accumulates them by default.
        model.zero_grad()

        # Forward pass. Because `labels` is provided, the first element of
        # the output is the loss.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs[0]

        # Accumulate the loss as a plain Python number so the epoch average
        # can be computed afterwards.
        total_loss += loss.item()

        # Backward pass to compute the gradients.
        loss.backward()

        # Clip the gradient norm to 1.0 to help prevent exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters, then advance the learning-rate schedule
        # (stepped once per batch).
        optimizer.step()
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)

    # Store the loss value for plotting the learning curve.
    train_loss_values.append(avg_train_loss)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")


Training...
torch.Size([16, 200])
torch.Size([16, 200])
torch.Size([16])
torch.Size([16, 200])
torch.Size([16, 200])
torch.Size([16])
torch.Size([16, 200])
torch.Size([16, 200])
torch.Size([16])
torch.Size([16, 200])
torch.Size([16, 200])
torch.Size([16])
torch.Size([16, 200])
torch.Size([16, 200])
torch.Size([16])
torch.Size([16, 200])
torch.Size([16, 200])
torch.Size([16])
torch.Size([16, 200])
torch.Size([16, 200])
torch.Size([16])
torch.Size([16, 200])
torch.Size([16, 200])
torch.Size([16])
torch.Size([16, 200])
torch.Size([16, 200])
torch.Size([16])
torch.Size([16, 200])
torch.Size([16, 200])
torch.Size([16])
torch.Size([16, 200])
torch.Size([16, 200])
torch.Size([16])
torch.Size([16, 200])
torch.Size([16, 200])
torch.Size([16])
torch.Size([16, 200])
torch.Size([16, 200])
torch.Size([16])
torch.Size([16, 200])
torch.Size([16, 200])
torch.Size([16])
torch.Size([16, 200])
torch.Size([16, 200])
torch.Size([16])
torch.Size([16, 200])
torch.Size([16, 200])
torch.Size([16])
torch.Size(

In [None]:
import os

# Create the target directory first so that a finished training run is not
# lost to a missing folder. `or '.'` covers a path without a directory part.
os.makedirs(os.path.dirname(model_save_path) or '.', exist_ok=True)

# Serialize the whole fine-tuned model object (not only its weights) to
# `model_save_path`.
torch.save(model, model_save_path)

Let's take a look at our training loss over all batches:

In [None]:
#import matplotlib.pyplot as plt
#% matplotlib inline

#import seaborn as sns

# Use plot styling from seaborn.
#sns.set(style='darkgrid')

# Increase the plot size and font size.
#sns.set(font_scale=1.5)
#plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve.
#plt.plot(train_loss_values, 'b-o')

# Plot the learning curve.
#plt.plot(eval_loss_values, 'r-o')

# Label the plot.
#plt.title("Training/Validation loss")
#plt.xlabel("Epoch")
#plt.ylabel("Loss")

#plt.show()

#  Performance On Test Set

## 4.1 Data Preparation

In [None]:
import pandas as pd

# Build a DataLoader over ALL summaries. Intermediate results are stored under
# `prediction_*` names so that the `input_ids` / `attention_masks` prepared for
# training are not silently overwritten when this cell runs.
# NOTE: this loader also contains the sentences used for training; the
# evaluation cell in section 4.2 iterates `test_dataloader` (held-out data).

# Tokenize every sentence, add `[CLS]` / `[SEP]`, and map tokens to their IDs.
encoded_summaries = [
    tokenizer.encode(sent, add_special_tokens=True)
    for sent in sentences
]

# Pad/truncate to MAX_LEN at the end of each sequence, padding with ID 0
# (stated explicitly, matching the training-time padding).
prediction_input_ids = pad_sequences(encoded_summaries, maxlen=MAX_LEN, dtype="long",
                                     value=0, truncating="post", padding="post")

# Mask of 1.0 for each real token and 0.0 for padding.
prediction_attention_masks = [
    [float(token_id > 0) for token_id in seq]
    for seq in prediction_input_ids
]

# Convert to tensors.
prediction_inputs = torch.tensor(prediction_input_ids)
prediction_masks = torch.tensor(prediction_attention_masks)
prediction_labels = torch.tensor(labels)

# Set the batch size.
batch_size = 16

# Create the DataLoader; sequential order keeps predictions aligned with `sentences`.
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

## 4.2 Evaluate on Test Set

In [None]:
# Run the fine-tuned model over the held-out split and measure its accuracy.

print('Predicting labels for {:,} test sentences...'.format(len(test_inputs)))

# Evaluation mode: dropout layers behave differently than during training.
model.eval()

# Per-batch logits and true labels, plus running accuracy counters.
predictions, true_labels = [], []
num_correct = num_samples = 0

for batch in test_dataloader:
    # Move the three tensors of this batch to the selected device.
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

    # No gradients are needed for prediction; skipping them saves memory
    # and time.
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

    # No labels were passed, so the first output element holds the logits.
    # Bring logits and labels back to the CPU as numpy arrays.
    logits = outputs[0].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Predicted class = index of the largest logit in each row.
    pred_labels_i = logits.argmax(axis=1).flatten()
    num_correct += (pred_labels_i == label_ids).sum()
    num_samples += len(pred_labels_i)

    predictions.append(logits)
    true_labels.append(label_ids)

print('    DONE.')
print(f'Got {num_correct} / {num_samples} with accuracy {float(num_correct)/float(num_samples)*100:.2f}')

Predicting labels for 320 test sentences...
    DONE.
Got 171 / 320 with accuracy 53.44


In [None]:
from sklearn.metrics import matthews_corrcoef

# Per-batch MCC values, plus flat lists of every predicted and true label
# (the flat lists are reused for the correlation statistics further down).
matthews_set = []
prediction = []
true_label = []

print('Calculating Matthews Corr. Coef. for each batch...')

for batch_logits, batch_labels in zip(predictions, true_labels):
    # Each row of `batch_logits` holds one score per class; the predicted
    # label is the column index of the largest score.
    pred_labels_i = batch_logits.argmax(axis=1).flatten()

    prediction.extend(pred_labels_i)
    true_label.extend(batch_labels)

    # Matthews correlation coefficient for this batch alone.
    matthews_set.append(matthews_corrcoef(batch_labels, pred_labels_i))



Calculating Matthews Corr. Coef. for each batch...


In [None]:
matthews_set

In [None]:
# Stack the logits of every batch and take the highest-scoring class per row.
all_logits = np.concatenate(predictions, axis=0)
flat_predictions = all_logits.argmax(axis=1).flatten()

# Join the true labels of every batch into one flat list.
flat_true_labels = list(np.concatenate(true_labels))

# Matthews correlation coefficient over the whole evaluated set.
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)

print('MCC: %.3f' % mcc)

MCC: 0.486


In [None]:
# Import from the public `scipy.stats` namespace; `scipy.stats.stats` is a
# deprecated private module path.
from scipy.stats import pearsonr, spearmanr

# Linear (Pearson) and rank (Spearman) correlation between the true labels
# and the predicted labels collected in the per-batch MCC cell.
pearson = pearsonr(true_label, prediction)
spearman = spearmanr(true_label, prediction)

In [None]:
pearson

(0.7171910453751473, 6.41650393022972e-253)

In [None]:
spearman

SpearmanrResult(correlation=0.7295666608883566, pvalue=5.818507977336141e-266)