In [None]:
# This practice is inspired by:
# https://github.com/CSCfi/machine-learning-scripts/blob/master/notebooks/pytorch-imdb-bert.ipynb
# https://github.com/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb

In [None]:
import torch
from torch.utils.data import (TensorDataset, DataLoader,
                              RandomSampler, SequentialSampler)
from torch.utils.tensorboard import SummaryWriter
from transformers import BertTokenizer, BertConfig
from transformers import BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup as WarmupLinearSchedule
from torch.optim import AdamW

from distutils.version import LooseVersion as LV

from sklearn.model_selection import train_test_split

import io

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

# Select the compute device once: CUDA when torch reports it available,
# otherwise the CPU. `devicename` is the bracketed name of GPU 0, or an
# empty string when running on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
devicename = ('[' + torch.cuda.get_device_name(0) + ']'
              if device.type == 'cuda' else "")

<h1>IMDb Dataset</h1>

<h4>IMDb dataset is a widely used dataset for natural language processing (NLP) and sentiment analysis tasks. It consists of 25,000 highly polar movie reviews for training, and 25,000 for testing.
<a href="https://ai.stanford.edu/~amaas/data/sentiment/">https://ai.stanford.edu/~amaas/data/sentiment/</a></h4>
<h4>We write our own dataset downloading and extraction process here.</h4>

In [None]:
import os
import re
import requests
import tarfile
import subprocess

# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
    """Load every review file found in `directory` into a DataFrame.

    Review files are expected to be named ``<id>_<rating>.txt``. Entries whose
    name does not match that pattern (e.g. stray system files) are skipped
    instead of crashing the whole load.

    Args:
        directory: Path of the folder that holds the review text files.

    Returns:
        A DataFrame with two columns: ``sentence`` (the file's text) and
        ``sentiment`` (the rating parsed from the file name, kept as a string).
    """
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    name_pattern = re.compile(r"\d+_(\d+)\.txt")

    data = {"sentence": [], "sentiment": []}
    # os.listdir order is arbitrary; sorting makes the row order reproducible.
    for file_name in sorted(os.listdir(directory)):
        match = name_pattern.match(file_name)
        if match is None:
            continue
        # Explicit encoding so the result does not depend on the platform's
        # default locale encoding.
        with open(os.path.join(directory, file_name), "r", encoding="utf-8") as f:
            data["sentence"].append(f.read())
        data["sentiment"].append(match.group(1))

    return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
    """Build one shuffled DataFrame from the `pos` and `neg` sub-folders.

    Each sub-folder is read with `load_directory_data`; a `polarity` column is
    added (1 for `pos`, 0 for `neg`), the two frames are concatenated, the rows
    are shuffled and the index is reset.
    """
    frames = []
    for polarity, subdir in ((1, "pos"), (0, "neg")):
        frame = load_directory_data(os.path.join(directory, subdir))
        frame["polarity"] = polarity
        frames.append(frame)

    shuffled = pd.concat(frames).sample(frac=1)
    return shuffled.reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
    """Return the IMDb train and test DataFrames, downloading them if needed.

    On the first call the archive is downloaded into ``data/``, extracted, and
    both splits are parsed and pickled under ``data/aclImdb/``. Later calls
    load those pickles instead, unless `force_download` is true.

    Args:
        force_download: When true, ignore any cached pickles and download,
            extract and parse the archive again.

    Returns:
        A ``(train_df, test_df)`` tuple of pandas DataFrames.

    Raises:
        Exception: If the server does not answer the download with HTTP 200.
        subprocess.CalledProcessError: If the ``tar`` extraction fails.
    """
    url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
    fname = "aclImdb.tar.gz"

    dataset = "data/"

    # "aclImdb.tar.gz" -> "aclImdb": the folder the archive extracts into.
    extracted_dir = os.path.join(dataset, fname[:fname.find('.')])
    train_cache = os.path.join(extracted_dir, 'train_df.p')
    test_cache = os.path.join(extracted_dir, 'test_df.p')

    # if downloaded:
    if (not force_download
            and os.path.exists(train_cache)
            and os.path.exists(test_cache)):
        print("Dataset has already downloaded, loading catched dataset instead.")
        # NOTE: unpickling executes code from the file; this is only safe
        # because these pickles are written by this function itself.
        train_df = pd.read_pickle(train_cache)
        test_df = pd.read_pickle(test_cache)
        return train_df, test_df

    # The target folder does not exist on a fresh checkout.
    os.makedirs(dataset, exist_ok=True)
    archive_path = os.path.join(dataset, fname)

    print("Downloading...")
    response = requests.get(url, stream=True)
    try:
        if response.status_code != 200:
            raise Exception("Downloading error.")
        with open(archive_path, 'wb') as f:
            # Copy the raw body in 1 MiB chunks instead of holding the whole
            # archive in memory at once.
            for chunk in iter(lambda: response.raw.read(1 << 20), b""):
                f.write(chunk)
    finally:
        response.close()

    # untar
    print("Extracting...")
    # check_call surfaces a failed extraction here rather than as a confusing
    # missing-directory error further down.
    subprocess.check_call(["tar", "xzf", archive_path, "-C", dataset])

    train_df = load_dataset(os.path.join(extracted_dir, "train"))
    test_df = load_dataset(os.path.join(extracted_dir, "test"))

    # to_pickle overwrites any existing file, so no explicit removal is needed.
    train_df.to_pickle(train_cache)
    test_df.to_pickle(test_cache)
    print("Dataset is downloaded.")

    return train_df, test_df

<h4>Call the dataloading function to load the dataset to Pandas dataframes.</h4>

In [None]:
# The first run downloads and extracts the archive; later runs load the
# pickled DataFrames cached under data/aclImdb/.
train_df, test_df = download_and_load_datasets()

<h4>For this practice, we'll reduce the training data to make the demonstration more efficient.</h4>

In [None]:
# Keep a fixed-seed random subset of 3,200 reviews from each split.
train_df, test_df = (
    split.sample(3200, random_state=1) for split in (train_df, test_df)
)

<h4>
    Let's take a look at 10 random samples from the train set.
</h4>

In [None]:
# Report the shape of each split and its negative/positive review counts.
print('IMDB data loaded:')
print('train:', train_df.shape)
print('numbers of negative reviews vs positive reviews: \n',
      (train_df.polarity == 0).sum(), " vs. ", (train_df.polarity == 1).sum())
print('test:', test_df.shape)
print('numbers of negative reviews vs positive reviews: \n',
      (test_df.polarity == 0).sum(), " vs. ", (test_df.polarity == 1).sum())
# Display 10 randomly drawn training rows as the cell output.
train_df.sample(10)

<h4>
    Let's take a look at the data format in the dataset.
</h4>
<h5>The token [CLS] is a special token required by BERT at the beginning of the sentence.</h5>

In [None]:
# Human-readable names for the two polarity labels.
class_names = {0: 'negative', 1: 'positive'}

# Prefix every review with the [CLS] token.
sentences_train = ["[CLS] " + review for review in train_df.sentence.values]
sentences_test = ["[CLS] " + review for review in test_df.sentence.values]

labels_train = train_df.polarity.values
labels_test = test_df.polarity.values

# Show the first review of each split together with its label.
print("The first training sentence:\n")
print(sentences_train[0] + "\n")
print('LABEL:', class_names[labels_train[0]])
print()
print("The first testing sentence:\n")
print(sentences_test[0] + "\n")
print('LABEL:', class_names[labels_test[0]])

<h1>Data preparation</h1>

<h4>The IMDb dataset is perfectly balanced. Please note that we randomly reduced both the train and test sets to 3,200 samples each, so the sample counts shown above are slightly imbalanced.</h4>
<h4>However, before using the BERT model, the text needs to be tokenized into a format it can understand. </h4>


<h4>
    Let's get the tokenizer of the BERT model. Then, tokenize the dataset.
</h4>

In [None]:
# Name of the pre-trained checkpoint whose vocabulary the tokenizer uses.
BERTMODEL = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(BERTMODEL, do_lower_case=True)

# Tokenize every review of both splits.
tokenized_train = list(map(tokenizer.tokenize, sentences_train))
tokenized_test = list(map(tokenizer.tokenize, sentences_test))

print("The full tokenized first training sentence:")
print(tokenized_train[0])

<h4>BERT doesn’t understand raw text like humans do. Instead, it processes numerical representations of text. To bridge this gap, tokenization is necessary.</h4>
<h4>The above will take a little time, as the tokenizer needs to go through every word and symbol in the dataset.</h4>
<h4>
While we are waiting, let's see a little more details about BERT.
</h4> 
<h4>It needs each piece of the text input to have an ID, with the text split into words and punctuation marks, so that the relationships between them can be learned.
</h4>
<img src="https://upload.wikimedia.org/wikipedia/commons/b/b5/BERT_embeddings_01.png" alt="BERT Encoder">
<h4>
    In addition, there is an input length limit for the BERT model. So, as the next step, we need to truncate the formatted tokens.
</h4>

In [None]:
# Maximum sequence lengths (in tokens, including the trailing separator).
MAX_LEN_TRAIN, MAX_LEN_TEST = 128, 512

# Truncate to leave room for one closing token, then append BERT's separator.
# The special token is spelled '[SEP]' with the brackets; a bare 'SEP' is not
# that token and would not be encoded as the separator.
# NOTE: re-running this cell on already-processed lists appends a second
# '[SEP]' to short sequences; re-run the tokenization cell first.
tokenized_train = [t[:(MAX_LEN_TRAIN-1)]+['[SEP]'] for t in tokenized_train]
tokenized_test  = [t[:(MAX_LEN_TEST-1)]+['[SEP]'] for t in tokenized_test]

print ("The truncated tokenized first training sentence:")
print (tokenized_train[0])

<h4>
    Let's take a look at the IDs of one of the training sentences.
</h4>

In [None]:
# Map tokens to vocabulary ids, then right-pad every sequence with zeros up to
# the split's maximum length so each split becomes one rectangular array.
ids_train = np.array([
    np.pad(token_ids, (0, MAX_LEN_TRAIN - len(token_ids)), mode='constant')
    for token_ids in map(tokenizer.convert_tokens_to_ids, tokenized_train)
])

ids_test = np.array([
    np.pad(token_ids, (0, MAX_LEN_TEST - len(token_ids)), mode='constant')
    for token_ids in map(tokenizer.convert_tokens_to_ids, tokenized_test)
])

print("The indices of the training sentence:")
print(ids_train[15])

<h4>
    Why are there 0's in the IDs?
</h4>
<h4>
    All inputs must be the same length. However, not all sentences are equal in length — so the 0's are padding that makes the length uniform.
</h4>

<h4>
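    BERT uses self-attention, which means it learns to understand a sentence by letting every token look at the other tokens of the same sentence. The padding 0's carry no meaning, so we give the model an attention mask (1 for real tokens, 0 for padding) that tells it which positions to ignore.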
</h4>
<h4>
    Next, let’s create the attention masks for our dataset.
</h4>

In [None]:
# Attention masks: 1.0 where the token id is positive, 0.0 where it is the
# zero used for padding.
amasks_train = [[float(token_id > 0) for token_id in row] for row in ids_train]
amasks_test = [[float(token_id > 0) for token_id in row] for row in ids_test]

print("The masked training sentence:")
print(amasks_train[15])

<h4>
    Let's split the training set to create a validation set.
</h4>
<h4>
    Why?
</h4>
<h4>
    A validation set is used to evaluate the model during training, without touching the test set. It helps you answer:
    <br><br>
"How well is my model learning, and when should I stop training?"
</h4>

In [None]:
# Hold out 10% of the training data for validation. Ids, labels and masks are
# split in a single call so all three share the same row partition.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = train_test_split(
    ids_train, labels_train, amasks_train,
    random_state=42, test_size=0.1)

<h4>
    We have the data prepared now. Let's create the dataloader for all these sets to prepare for the next step -- training.
</h4>

In [None]:
# Turn every array/list of every split into a torch tensor.
train_inputs, train_masks, train_labels = map(
    torch.tensor, (train_inputs, train_masks, train_labels))
validation_inputs, validation_masks, validation_labels = map(
    torch.tensor, (validation_inputs, validation_masks, validation_labels))
test_inputs, test_masks, test_labels = map(
    torch.tensor, (ids_test, amasks_test, labels_test))

# For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32.
BATCH_SIZE = 32

# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler,
                              batch_size=BATCH_SIZE)
print(f"Train: {len(train_data)} reviews")

# Validation and test batches keep their original order.
validation_data = TensorDataset(validation_inputs, validation_masks,
                                validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data,
                                   sampler=validation_sampler,
                                   batch_size=BATCH_SIZE)
print(f"Validation: {len(validation_data)} reviews")

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler,
                             batch_size=BATCH_SIZE)
print(f"Test: {len(test_data)} reviews")

<h1>Use a pre-trained BERT model.</h1>
<h4>
    Instead of training from scratch, we start with a pre-trained BERT model that has already learned a lot about the structure and meaning of language.
</h4>
<h5>
    For example, the model here (google-bert) was pretrained on BookCorpus, a dataset consisting of 11,038 unpublished books and English Wikipedia (excluding lists, tables and headers). By using a pre-trained model, we don’t need to train from scratch for it to understand grammar, context, and word relationships. Instead, we can simply fine-tune BERT on our own smaller dataset for a specific task.
</h5>

In [None]:
# Pre-trained checkpoint to fine-tune, and the names of the output classes.
BERTMODEL = 'bert-base-uncased'
class_names = dict(enumerate(('negative', 'positive')))

# One output label per class name.
model = BertForSequenceClassification.from_pretrained(
    BERTMODEL, num_labels=len(class_names))

<h4>Move the model to GPU to prepare for training</h4>

In [None]:
# Move the model to the device selected at the top of the notebook. Unlike
# model.cuda(), this also works on CPU-only machines, and it matches the
# batch.to(device) calls used in train() and evaluate().
model.to(device)

<h1>Train the model.</h1>
<h4>
    Let's configure the hyperparameters first.
</h4>

In [None]:
# For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32.
BATCH_SIZE = 32
# Number of training epochs -- one epoch is one full pass over the train set
EPOCHS = 4
# Learning rate -- size of each parameter update
LR = 2e-5
# Weight decay passed to the AdamW optimizer
WEIGHT_DECAY = 0.01

# Warm the learning rate up over the first 20% of one epoch's batches.
WARMUP_STEPS = int(0.2 * len(train_dataloader))

# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
decay_params, no_decay_params = [], []
for param_name, param in model.named_parameters():
    if any(nd in param_name for nd in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_grouped_parameters = [
    {'params': decay_params, 'weight_decay': WEIGHT_DECAY},
    {'params': no_decay_params, 'weight_decay': 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=LR, eps=1e-8)
scheduler = WarmupLinearSchedule(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=len(train_dataloader) * EPOCHS,
)

<h4>Let's define the function for training and the function for validation.</h4>

In [None]:
def train(epoch, loss_vector=None, log_interval=15):
    """Run one training epoch over `train_dataloader`.

    Uses the notebook-level `model`, `optimizer`, `scheduler`, `device` and
    `train_dataloader`. The loss of every batch is logged to TensorBoard under
    "Loss/train".

    Args:
        epoch: 1-based epoch number; used for the progress output and to
            compute the global step of each TensorBoard entry.
        loss_vector: Optional list; when given, the loss of every batch is
            appended to it.
        log_interval: Print a progress line every `log_interval` batches.
    """
    # Set model to training mode
    model.train()

    steps_per_epoch = len(train_dataloader)

    # One writer for the whole epoch: opening a new SummaryWriter for every
    # batch is wasteful and leaves a separate event file per step.
    writer = SummaryWriter("log/txt_cls/imdb_experiment")
    try:
        # Loop over each batch from the training set
        for step, batch in enumerate(train_dataloader):

            # Copy data to the selected device and unpack it
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

            # Zero gradient buffers
            optimizer.zero_grad()

            # Forward pass; with labels supplied, the first output is the loss
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)

            loss = outputs[0]
            loss_value = loss.item()
            if loss_vector is not None:
                loss_vector.append(loss_value)

            writer.add_scalar("Loss/train", loss_value,
                              (epoch - 1) * steps_per_epoch + step)

            # Backward pass
            loss.backward()

            # Update the weights first, then advance the learning-rate
            # schedule. Stepping the scheduler before the optimizer skips the
            # first value of the schedule and triggers a PyTorch warning.
            optimizer.step()
            scheduler.step()

            if step % log_interval == 0:
                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                        epoch, step * len(b_input_ids),
                        len(train_dataloader.dataset),
                        100. * step / steps_per_epoch, loss_value))
    finally:
        # Flush and release the log file even if a batch raises.
        writer.close()

In [None]:
def evaluate(loader, epoch, validation = True):
    """Measure the accuracy of the notebook-level `model` on `loader`.

    The accuracy is written to TensorBoard at step `epoch` under
    "Accuracy/validation" when `validation` is true and under "Accuracy/test"
    otherwise, and is also printed.

    Args:
        loader: DataLoader yielding (input_ids, attention_mask, labels) batches.
        epoch: Step value attached to the TensorBoard entry.
        validation: Chooses which of the two TensorBoard tags is used.
    """
    model.eval()

    n_correct, n_all = 0, 0

    for batch in loader:
        b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

        # No gradients are needed for evaluation.
        with torch.no_grad():
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)[0]

        # The predicted class is the index of the largest logit.
        predictions = np.argmax(logits.detach().cpu().numpy(), axis=1)
        labels = b_labels.to('cpu').numpy()

        n_correct += np.sum(predictions == labels)
        n_all += len(labels)

    tag = "Accuracy/validation" if validation else "Accuracy/test"

    # start a TensorBoard summary writer to log the accuracy
    writer = SummaryWriter("log/txt_cls/imdb_experiment")
    writer.add_scalar(tag, n_correct/n_all, epoch)
    writer.close()
    print('Accuracy: [{}/{}] {:.4f}\n'.format(n_correct, n_all, n_correct/n_all))

<h4>
    Train the model.
</h4>

In [None]:
# Collects the loss of every training batch across all epochs.
train_lossv = []
for epoch in range(1, EPOCHS + 1):
    # One pass over the training batches, then an accuracy check on the
    # validation set.
    train(epoch, train_lossv)
    print('\nValidation set:')
    evaluate(validation_dataloader, epoch)

<h4>
    Training performance.
</h4>

In [None]:
# Plot the raw per-batch loss together with a 101-batch moving average.
fig, ax = plt.subplots(figsize=(15, 8))
ax.set_title("Training loss")
ax.set_xlabel("Batch")
ax.set_ylabel("Loss")
ax.plot(train_lossv, label='original')
ax.plot(np.convolve(train_lossv, np.ones(101), 'same') / 101, label='averaged')
ax.legend(loc='best')
plt.show()

<h1>Evaluate the trained model.</h1>
<h4>
    Did you notice that the test set was never used anywhere during training or validation?
</h4>
<h4>It's <b><u>crucial</u></b> that the testing set remains completely unseen throughout the training and validation phases to avoid data contamination for accurate evaluation.​</h4>
<h4>There are various metrics available to assess the effectiveness of a trained model. Selection often depends on the study case. Here, we evaluate only overall accuracy for simplicity and demonstration purposes.</h4>

In [None]:
# The evaluation function can also be used for the testing set.
# But the testing set samples should always remain unseen during training.
# Passing False logs the result under "Accuracy/test" instead of
# "Accuracy/validation".
print('Test set:')
evaluate(test_dataloader, EPOCHS, False)