In [1]:
import re
import torch
import random
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

from tqdm import tqdm_notebook
import matplotlib.pyplot as plt

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [None]:
# import the needed classes
# we are doing a classification task
from transformers import BertTokenizer, BertForSequenceClassification

In [2]:
# download the data here
data_dir = '/home/vijjus/datasets/'

In [3]:
# set seeds for reproduceability
random.seed(24)
np.random.seed(24)
torch.manual_seed(24)
torch.cuda.manual_seed_all(24)

In [4]:
# import the tokenizer for the model in use
# this tokenizer is used to tokenize the input before numerical
# vectors are created
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [5]:
# read in the dataset
m_data = pd.read_csv(data_dir + 'IMDBDataset.csv')

In [6]:
# check the stats on this dataset
m_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,negative
freq,5,25000


In [7]:
# look at a few examples
m_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
# removing the html parts, special characters etc
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_special_characters(text)
    return text

In [9]:
#Apply function on review column
m_data['review']=m_data['review'].apply(denoise_text)

In [10]:
# verify that the data looks cleaner now
m_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production The filming tech...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically theres a family where a little boy J...,negative
4,Petter Matteis Love in the Time of Money is a ...,positive


In [11]:
# since sentiment is either 
y = [1 if x == 'positive' else 0 for x in m_data['sentiment']]

In [12]:
# set to maximum size of BERT input vector
# Note: BERT supports a maxumum of 512. However, using
# a larger size may lead to memory issues.
max_length = 64

In [13]:
# let's work with a subset of data for ease of operation
sample_size = 5000
sample_data = m_data[:sample_size]
sample_y = y[:sample_size]

In [14]:
# lists for reviews, tokens, labels, etc.
# Note: BERT requires a [CLS] token at the start, and [PAD] tokens
# as fillers
def create_dataset(sample_features):
    reviewTokenList = []

    # lists for BERT input
    bertreviewIDs = []
    bertMasks = []
    bertSequenceIDs = []

    total_len = len(sample_data)

    for i in tqdm_notebook(range(total_len)):
    
        # always start with [CLS] tokens
        reviewTokens = ['[CLS]']
    
        text = sample_data.iloc[i]['review'].lower()
    
        words = text.split(" ")
    
        for word in words:
            reviewTokens += tokenizer.tokenize(word)
    
        if (len(reviewTokens) >= max_length):
            reviewTokens = reviewTokens[:max_length-1]
    
        reviewTokens += ['[PAD]'] * (max_length - len(reviewTokens))
    
        reviewTokenList.append(reviewTokens)

        reviewLength = min(max_length - 1, len(reviewTokens))
        bertreviewIDs.append(tokenizer.convert_tokens_to_ids(reviewTokens))
        bertMasks.append([1] * (reviewLength + 1) + [0] * (max_length - 1 - reviewLength))
        bertSequenceIDs.append([0] * (max_length))
        
    return reviewTokenList, bertreviewIDs, bertMasks, bertSequenceIDs

In [15]:
# create the numerical vectors for training
reviewTokenList, bertreviewIDs, bertMasks, bertSequenceIDs = create_dataset(sample_data)

HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




In [16]:
# lets take a look at the first tokenized review
# Note that BERT uses a WordPiece tokenizer (hence the ##)
reviewTokenList[1]

['[CLS]',
 'a',
 'wonderful',
 'little',
 'production',
 'the',
 'filming',
 'technique',
 'is',
 'very',
 'una',
 '##ss',
 '##uming',
 'very',
 'old',
 '##time',
 '##bb',
 '##c',
 'fashion',
 'and',
 'gives',
 'a',
 'comforting',
 'and',
 'sometimes',
 'discomfort',
 '##ing',
 'sense',
 'of',
 'realism',
 'to',
 'the',
 'entire',
 'piece',
 'the',
 'actors',
 'are',
 'extremely',
 'well',
 'chosen',
 'michael',
 'sheen',
 'not',
 'only',
 'has',
 'got',
 'all',
 'the',
 'polar',
 '##i',
 'but',
 'he',
 'has',
 'all',
 'the',
 'voices',
 'down',
 'pat',
 'too',
 'you',
 'can',
 'truly',
 'see',
 '[PAD]']

In [17]:
# Convert to Tensors and build dataset
all_input_ids = torch.tensor(bertreviewIDs, dtype=torch.long)
all_input_mask = torch.tensor(bertMasks, dtype=torch.long)
all_segment_ids = torch.tensor(bertSequenceIDs, dtype=torch.long)
all_labels = torch.tensor(sample_y, dtype=torch.long)

In [18]:
from torch.utils.data import TensorDataset, RandomSampler, DataLoader, SequentialSampler
from tqdm import trange
from torch.optim import Adam

In [19]:
train_dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_labels)

In [20]:
batch_size = 32
max_steps = len(bertreviewIDs) // batch_size
learning_rate = 5e-5
cache_dir="model"
num_train_epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logging_steps = 10
gradient_accumulation_steps = 1

In [21]:
device

device(type='cuda')

In [22]:
# instantiate the pre-trained model from HugginFace
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      cache_dir=cache_dir,
                                                      num_labels=2)

In [23]:
# initialize optimizer
param_optimizer = list(model.named_parameters())

# the following command removes the two pooler layers from the optimizer list
param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
]

# initialize Adam optimizer with set parameters
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

In [24]:
# create a random sampler for training using our dataset
train_sampler = RandomSampler(train_dataset)

In [25]:
# create a dataloader, using the random sampler from earlier
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

In [26]:
# look at the entire BERT model to see layers & dimensions
model.train()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [27]:
# Train!
print("***** Running training *****")
print("  Num examples = ", len(train_dataset))
print("  Num Epochs = ", num_train_epochs)
print("  Total optimization steps = ", max_steps)

global_step = 0
tr_loss, logging_loss = 0.0, 0.0

# initialize the gradiants
#model.zero_grad()

# switch to CUDA, if available
model.to(device)

# actual training loop
for epoch in trange(int(num_train_epochs), desc="Epoch"):
    # load batches from the dataloader iterator
    for step, batch in enumerate(train_dataloader):
        # load batches to the GPU and unpack to variables
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        # run the forward pass through the network
        outputs = model(input_ids, input_mask, segment_ids, labels=label_ids)
        # output is a tuple, with loss as the first element
        loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
        # back-prop the loss through the network
        loss.backward()
        # accumulate the loss
        tr_loss += loss.item()
        # every N steps (default 1) update the gradiants        
        if (step + 1) % gradient_accumulation_steps == 0:
            # upcade the weights, using the gradients
            optimizer.step()
            # and then zero the gradients
            optimizer.zero_grad()
            # this is one global step of our training
            global_step += 1
            # periodically, we log the loss
            if global_step % logging_steps == 0:
                print("Step: {} Loss: {:.3}".format(global_step, (tr_loss - logging_loss)/logging_steps))
                logging_loss = tr_loss

***** Running training *****
  Num examples =  5000
  Num Epochs =  3
  Total optimization steps =  156


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Step: 10 Loss: 0.714
Step: 20 Loss: 0.696
Step: 30 Loss: 0.665
Step: 40 Loss: 0.645
Step: 50 Loss: 0.647
Step: 60 Loss: 0.568
Step: 70 Loss: 0.537
Step: 80 Loss: 0.528
Step: 90 Loss: 0.494
Step: 100 Loss: 0.488
Step: 110 Loss: 0.458
Step: 120 Loss: 0.448
Step: 130 Loss: 0.452
Step: 140 Loss: 0.47
Step: 150 Loss: 0.473


Epoch:  33%|███▎      | 1/3 [02:01<04:03, 121.93s/it]

Step: 160 Loss: 0.37
Step: 170 Loss: 0.389
Step: 180 Loss: 0.347
Step: 190 Loss: 0.371
Step: 200 Loss: 0.344
Step: 210 Loss: 0.41
Step: 220 Loss: 0.308
Step: 230 Loss: 0.298
Step: 240 Loss: 0.342
Step: 250 Loss: 0.274
Step: 260 Loss: 0.322
Step: 270 Loss: 0.234
Step: 280 Loss: 0.332
Step: 290 Loss: 0.308
Step: 300 Loss: 0.313
Step: 310 Loss: 0.306


Epoch:  67%|██████▋   | 2/3 [04:03<02:01, 121.91s/it]

Step: 320 Loss: 0.285
Step: 330 Loss: 0.19
Step: 340 Loss: 0.175
Step: 350 Loss: 0.19
Step: 360 Loss: 0.191
Step: 370 Loss: 0.131
Step: 380 Loss: 0.215
Step: 390 Loss: 0.182
Step: 400 Loss: 0.188
Step: 410 Loss: 0.152
Step: 420 Loss: 0.158
Step: 430 Loss: 0.157
Step: 440 Loss: 0.222
Step: 450 Loss: 0.154
Step: 460 Loss: 0.194
Step: 470 Loss: 0.189


Epoch: 100%|██████████| 3/3 [06:05<00:00, 121.90s/it]


## Evaluating the Model ##

In [28]:
# let's evaluate with a subset of data for ease of operation
sample_size = 1000
sample_data = m_data[-sample_size:]
sample_y = y[-sample_size:]

In [29]:
# create the numerical vectors for evaluation
reviewTokenList, bertreviewIDs, bertMasks, bertSequenceIDs = create_dataset(sample_data)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [30]:
# Convert to Tensors and build dataset
all_input_ids = torch.tensor(bertreviewIDs, dtype=torch.long)
all_input_mask = torch.tensor(bertMasks, dtype=torch.long)
all_segment_ids = torch.tensor(bertSequenceIDs, dtype=torch.long)
all_labels = torch.tensor(sample_y, dtype=torch.long)

In [31]:
# bundle the tensors into a dataset
eval_dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_labels)

In [32]:
# initialize sampler
eval_sampler = SequentialSampler(eval_dataset)

In [33]:
# create a dataloader, using the random sampler from earlier
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=batch_size)

In [34]:
from sklearn.metrics import accuracy_score

In [35]:
# set the model to evaluation mode (parameters are frozen)
model.eval()

eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

# load batches from dataloader and unpack to variables
for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
    # load variables to GPU
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    
    with torch.no_grad():
        #run model
        outputs = model(input_ids, segment_ids, input_mask)
    
    logits = outputs[0]
    y_pred = []
    y_true = label_ids.detach().cpu().tolist()
    for i, label in enumerate(label_ids):
        v = np.argmax(logits[i].detach().cpu().numpy())
        y_pred.append(v)
    
    tmp_eval_accuracy = accuracy_score(y_true, y_pred)
    
    eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += input_ids.size(0)
    nb_eval_steps += 1

eval_accuracy = eval_accuracy / nb_eval_steps
print("Eval examples: {} steps: {}, accuracy={:.2}".format(nb_eval_examples, nb_eval_steps, eval_accuracy))

Eval examples: 1000 steps: 32, accuracy=0.81
