# BERT Exploration Series

### Twitter Disaster Analysis
### Version 01
Kaggle Link: https://www.kaggle.com/c/nlp-getting-started/

### Summary
- Use Huggingface's transformers library
- Convert text using BERT tokenizer line by line
- Add dense layer to the end of pooling layer
- Use masking and padding
- 

In [1]:
# check GPU
!nvidia-smi

Tue Feb 18 03:08:09 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.87.01    Driver Version: 418.87.01    CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla V100-SXM2...  On   | 00000000:00:1E.0 Off |                    0 |
| N/A   40C    P0    25W / 300W |      0MiB / 16130MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage    

In [2]:
import os
import sys
from pathlib import Path
import string
import re

import numpy as np
import pandas as pd
import torch
from transformers import BertModel, BertTokenizer

In [3]:
CURRENT_DIR = Path.cwd()
DATA_DIR = CURRENT_DIR.parent / 'data'

In [5]:
# read train set and test set
test = pd.read_csv(str(DATA_DIR / 'test.csv'))
train = pd.read_csv(str(DATA_DIR / 'train.csv'))

In [6]:
# load BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [8]:
# define model
class Model(torch.nn.Module):
    def __init__(self, ):
        super(Model, self).__init__()
        # pre-trained BERT model by HuggingFace
        self.base_model = BertModel.from_pretrained('bert-base-uncased')
        # add dense layer to the end
        self.fc1 = torch.nn.Linear(768, 1)
        
    def forward(self, ids, masks):
        x = self.base_model(ids, attention_mask=masks)[1]
        x = self.fc1(x)
        return x

In [9]:
# Torch define device
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [10]:
# load model and put on GPU
model = Model()
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=361.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Model(
  (base_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)


In [11]:
def bert_encode(text, tokenizer, max_len=160):
    """ BERT encoder for text 
    
    Return:
    @tokens: input token ids with 0s padding
    @pad_masks: 1 for inputs and 0 for paddings
    """
    # tokenize text using BERT tokenizer
    text = tokenizer.tokenize(text)
    # remove 2 tokens for start and end token
    text = text[:max_len-2]
    # add start and end token
    input_sequence = ["[CLS]"] + text + ["[SEP]"]
    # convert token to token_id
    tokens = tokenizer.convert_tokens_to_ids(input_sequence)
    # the rest of max_len need to be pad
    pad_len = max_len - len(input_sequence)
    # padding to max_len
    tokens += [0] * pad_len
    # masking, 1 for inputs and 0 for paddings
    pad_masks = [1] * len(input_sequence) + [0] * pad_len
    return tokens, pad_masks

In [12]:
def clean_text(text):
    """ Basic text cleaning """
    text = text.lower()
    text = re.sub(r'[^a-z0-9] \n', '', text)
    return text

In [13]:
# Use first 6000 for training, rest for validation
train_text = train.text[:6000]
val_text = train.text[6000:]
# clean text
train_text = train_text.apply(clean_text)
val_text = val_text.apply(clean_text)

In [14]:
# encode text and get mask
train_tokens = []
train_pad_masks = []
for text in train_text:
    tokens, masks = bert_encode(text)
    train_tokens.append(tokens)
    train_pad_masks.append(masks)
    
train_tokens = np.array(train_tokens)
train_pad_masks = np.array(train_pad_masks)

In [15]:
# same for validation
val_tokens = []
val_pad_masks = []
for text in val_text:
    tokens, masks = bert_encode(text)
    val_tokens.append(tokens)
    val_pad_masks.append(masks)
    
val_tokens = np.array(val_tokens)
val_pad_masks = np.array(val_pad_masks)

In [16]:
# build pytorch dataset
class Dataset(torch.utils.data.Dataset):
    
    def __init__(self, train_tokens, train_pad_masks, targets):
        
        super(Dataset, self).__init__()
        self.train_tokens = train_tokens
        self.train_pad_masks = train_pad_masks
        self.targets = targets
        
    def __getitem__(self, index):
        
        tokens = self.train_tokens[index]
        masks = self.train_pad_masks[index]
        target = self.targets[index]
        
        return (tokens, masks), target
    
    def __len__(self,):
        
        return len(self.train_tokens)

In [17]:
# build training dataset
train_dataset = Dataset(
                    train_tokens=train_tokens,
                    train_pad_masks=train_pad_masks,
                    targets=train.target[:6000]
)

In [18]:
# define hyperparameters
batch_size = 12
EPOCHS = 2

In [19]:
# build training dataloader
train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

In [20]:
# define loss and optimizer
criterion = torch.nn.BCEWithLogitsLoss()
# Use Adam Optimizer with learning rate of 0.00001
opt = torch.optim.Adam(model.parameters(), lr=0.00001)

In [21]:
# train model
model.train()
y_preds = []

# train 2 epochs
for epoch in range(EPOCHS):

    for i, ((tokens, masks), target) in enumerate(train_dataloader):

        y_pred = model(
                    tokens.long().to(device), 
                    masks.long().to(device)
                )
        loss = criterion(y_pred, target[:, None].float().to(device))
        opt.zero_grad()
        loss.backward()
        opt.step()
        # print("Step:", i)
    print('\rEpoch: %d/%d, %f%% loss: %0.2f'% (epoch+1, EPOCHS, (i+1)/len(train_dataloader)*100, loss.item()), end='')
    print()

Step: 0
Step: 1
Step: 2
Step: 3
Step: 4
Step: 5
Step: 6
Step: 7
Step: 8
Step: 9
Step: 10
Step: 11
Step: 12
Step: 13
Step: 14
Step: 15
Step: 16
Step: 17
Step: 18
Step: 19
Step: 20
Step: 21
Step: 22
Step: 23
Step: 24
Step: 25
Step: 26
Step: 27
Step: 28
Step: 29
Step: 30
Step: 31
Step: 32
Step: 33
Step: 34
Step: 35
Step: 36
Step: 37
Step: 38
Step: 39
Step: 40
Step: 41
Step: 42
Step: 43
Step: 44
Step: 45
Step: 46
Step: 47
Step: 48
Step: 49
Step: 50
Step: 51
Step: 52
Step: 53
Step: 54
Step: 55
Step: 56
Step: 57
Step: 58
Step: 59
Step: 60
Step: 61
Step: 62
Step: 63
Step: 64
Step: 65
Step: 66
Step: 67
Step: 68
Step: 69
Step: 70
Step: 71
Step: 72
Step: 73
Step: 74
Step: 75
Step: 76
Step: 77
Step: 78
Step: 79
Step: 80
Step: 81
Step: 82
Step: 83
Step: 84
Step: 85
Step: 86
Step: 87
Step: 88
Step: 89
Step: 90
Step: 91
Step: 92
Step: 93
Step: 94
Step: 95
Step: 96
Step: 97
Step: 98
Step: 99
Step: 100
Step: 101
Step: 102
Step: 103
Step: 104
Step: 105
Step: 106
Step: 107
Step: 108
Step: 109
Step: 110


Step: 338
Step: 339
Step: 340
Step: 341
Step: 342
Step: 343
Step: 344
Step: 345
Step: 346
Step: 347
Step: 348
Step: 349
Step: 350
Step: 351
Step: 352
Step: 353
Step: 354
Step: 355
Step: 356
Step: 357
Step: 358
Step: 359
Step: 360
Step: 361
Step: 362
Step: 363
Step: 364
Step: 365
Step: 366
Step: 367
Step: 368
Step: 369
Step: 370
Step: 371
Step: 372
Step: 373
Step: 374
Step: 375
Step: 376
Step: 377
Step: 378
Step: 379
Step: 380
Step: 381
Step: 382
Step: 383
Step: 384
Step: 385
Step: 386
Step: 387
Step: 388
Step: 389
Step: 390
Step: 391
Step: 392
Step: 393
Step: 394
Step: 395
Step: 396
Step: 397
Step: 398
Step: 399
Step: 400
Step: 401
Step: 402
Step: 403
Step: 404
Step: 405
Step: 406
Step: 407
Step: 408
Step: 409
Step: 410
Step: 411
Step: 412
Step: 413
Step: 414
Step: 415
Step: 416
Step: 417
Step: 418
Step: 419
Step: 420
Step: 421
Step: 422
Step: 423
Step: 424
Step: 425
Step: 426
Step: 427
Step: 428
Step: 429
Step: 430
Step: 431
Step: 432
Step: 433
Step: 434
Step: 435
Step: 436
Step: 437


In [22]:
# build validation dataset and dataloader
val_dataset = Dataset(
                    train_tokens=val_tokens,
                    train_pad_masks=val_pad_masks,
                    targets=train.target[6000:].reset_index(drop=True)
)
val_dataloader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=3, shuffle=False)

In [23]:
# define accuracy metric
def accuracy(y_actual, y_pred):
    y_ = y_pred > 0
    return np.sum(y_actual == y_).astype('int') / y_actual.shape[0]

In [24]:
# evaluate model on val dataset
model.eval()
avg_acc = 0
for i, ((tokens, masks), target) in enumerate(val_dataloader):

    y_pred = model(
                tokens.long().to(device), 
                masks.long().to(device), 
            )
    loss = criterion(y_pred,  target[:, None].float().to(device))
    acc = accuracy(target.cpu().numpy(), y_pred.detach().cpu().numpy().squeeze())
    avg_acc += acc
    print('\r%0.2f%% loss: %0.2f, accuracy %0.2f'% (i/len(val_dataloader)*100, loss.item(), acc), end='')
print('\nAverage accuracy: ', avg_acc / len(val_dataloader))

99.81% loss: 0.02, accuracy 1.00
Average accuracy:  0.8203221809169773


In [25]:
# define test dataset
class TestDataset(torch.utils.data.Dataset):
    
    def __init__(self, test_tokens, test_pad_masks):
        
        super(TestDataset, self).__init__()
        self.test_tokens = test_tokens
        self.test_pad_masks = test_pad_masks
        
    def __getitem__(self, index):
        
        tokens = self.test_tokens[index]
        masks = self.test_pad_masks[index]
        
        return (tokens, masks)
    
    def __len__(self,):
        
        return len(self.test_tokens)

In [26]:
# encode test text and get mask
test_tokens = []
test_pad_masks = []
for text in test.text:
    tokens, masks = bert_encode(text)
    test_tokens.append(tokens)
    test_pad_masks.append(masks)
    
test_tokens = np.array(test_tokens)
test_pad_masks = np.array(test_pad_masks)

In [27]:
# build test dataset and dataloader
test_dataset = TestDataset(
    test_tokens=test_tokens,
    test_pad_masks=test_pad_masks
)
test_dataloader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=3, shuffle=False)

In [28]:
# get result from test dataset
model.eval()
y_preds = []
for (tokens, masks) in test_dataloader:

    y_pred = model(
                tokens.long().to(device), 
                masks.long().to(device), 
            )
    y_preds += y_pred.detach().cpu().numpy().squeeze().tolist()

In [30]:
# get submission dataframe
submission_df = pd.read_csv(str(DATA_DIR / 'sample_submission.csv'))
submission_df['target'] = (np.array(y_preds) > 0).astype('int')
print(submission_df.target.value_counts())
print(submission_df.head())

0    1876
1    1387
Name: target, dtype: int64
   id  target
0   0       1
1   2       1
2   3       1
3   9       1
4  11       1


In [31]:
# output csv
submission_df.to_csv(str(DATA_DIR / 'submission.csv'), index=False)