# BERT Exploration Series

## Introduction

Twitter Disaster Analysis

**Version 02**

Kaggle Link: https://www.kaggle.com/c/nlp-getting-started/
#### Summary
- Based on version 01
- Use config file instead
- Add general preprocessing

## Setup

In [1]:
import os
import sys
from pathlib import Path
import string
import re

import numpy as np
import pandas as pd
import torch
from transformers import BertModel, BertTokenizer

In [2]:
# fix all random seeds
# borrow from: https://www.kaggle.com/bibek777/bert-baseline
import random

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [9]:
class Config(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)
    
    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)

# configuration
config = Config(
    testing=False,
    model_name="bert-base-uncased",
    max_lr=1e-5,
    epochs=2,
    bs=12,
    discriminative=False,
    max_seq_len=256,
    sample_size=6000,
    path_to_dataset = DATA_DIR
)

### Data

In [3]:
CURRENT_DIR = Path.cwd()
DATA_DIR = CURRENT_DIR.parent / 'data'

In [47]:
# read train set and test set
test = pd.read_csv(str(DATA_DIR / 'test.csv'))
train = pd.read_csv(str(DATA_DIR / 'train.csv'))

## Preprocessing

### Check BERT tokenizer coverage

Reference
- [Pre-Processing for BERT](https://www.kaggle.com/imvivek14/pre-processing-for-bert)
- [How to: Preprocessing when using embeddings](https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings)

In [5]:
import operator

In [6]:
# tweets are unstructured and BERT encoder coverage is limited
df = pd.concat([train, test], ignore_index=True)
df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1.0
1,4,,,Forest fire near La Ronge Sask. Canada,1.0
2,5,,,All residents asked to 'shelter in place' are ...,1.0
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1.0
...,...,...,...,...,...
10871,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,
10872,10865,,,Storm in RI worse than last hurricane. My city...,
10873,10868,,,Green Line derailment in Chicago http://t.co/U...,
10874,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...,


In [7]:
# use lower case since our BERT model is uncased
df['text'] = df['text'].str.lower()

In [10]:
# Check BERT tokenizer coverage
# ref. https://www.kaggle.com/imvivek14/pre-processing-for-bert
tokenizer = BertTokenizer.from_pretrained(config.model_name)

In [11]:
# this buids the vocab. of our dataset
def build_vocab(texts):
    sentences = texts.apply(lambda x: x.split()).values
    vocab = {}
    for sentence in sentences:
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab
vocab = build_vocab(df['text'])
list(vocab.keys())[:10]

['our',
 'deeds',
 'are',
 'the',
 'reason',
 'of',
 'this',
 '#earthquake',
 'may',
 'allah']

In [12]:
# this check how much of our vocab is similar to the BERT vocab.
def check_coverage(vocab, embeddings_index):
    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    for word in vocab.keys():
        try:
            known_words[word] = embeddings_index[word]
            nb_known_words += vocab[word]
        except:
            unknown_words[word] = vocab[word]
            nb_unknown_words += vocab[word]
            pass

    print('Found embeddings for {:.3%} of vocab'.format(len(known_words) / len(vocab)))
    print('Found embeddings for  {:.3%} of all text'.format(nb_known_words / (nb_known_words + nb_unknown_words)))
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

    return unknown_words

print("BERT")
oov_bert = check_coverage(vocab, tokenizer.vocab)

BERT
Found embeddings for 24.789% of vocab
Found embeddings for  71.851% of all text


In [13]:
oov_bert[:10]

[('&amp;', 429),
 ("i'm", 336),
 ('??', 304),
 ("it's", 259),
 ("don't", 221),
 ('????', 139),
 ("can't", 119),
 ('@youtube', 116),
 ('#news', 92),
 ("you're", 88)]

### Use TweetTokenizer

Reference:
- [BERT_fastai](https://www.kaggle.com/kpriyanshu256/bert-fastai/comments)

In [None]:
# !pip install nltk

In [14]:
from nltk.tokenize import TweetTokenizer
twt = TweetTokenizer(strip_handles=True)
def tweets(r):
    s = ' '.join(twt.tokenize(r['text']))
    s = re.sub(r'http\S+', '', s)
    s = re.sub(r'https\S+', '', s)    
    return s

In [15]:
df['ptext'] = df.apply(tweets, axis=1)

In [16]:
df.head()

Unnamed: 0,id,keyword,location,text,target,ptext
0,1,,,our deeds are the reason of this #earthquake m...,1.0,our deeds are the reason of this #earthquake m...
1,4,,,forest fire near la ronge sask. canada,1.0,forest fire near la ronge sask . canada
2,5,,,all residents asked to 'shelter in place' are ...,1.0,all residents asked to ' shelter in place ' ar...
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0,"13,000 people receive #wildfires evacuation or..."
4,7,,,just got sent this photo from ruby #alaska as ...,1.0,just got sent this photo from ruby #alaska as ...


In [17]:
vocab = build_vocab(df['ptext'])

In [18]:
oov_bert = check_coverage(vocab, tokenizer.vocab)

Found embeddings for 48.485% of vocab
Found embeddings for  85.664% of all text


In [19]:
oov_bert[:10]

[('\x89', 1151),
 ('û_', 508),
 ("i'm", 350),
 ("it's", 273),
 ("don't", 229),
 ('..', 181),
 ('û', 131),
 ('ûªs', 125),
 ("can't", 122),
 ('#news', 98)]

### Further processing
Reference:
- [How to: Preprocessing when using embeddings](https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings/notebook)

In [21]:
# further cleaning
def clean_text(x):

    x = str(x)
    for punct in "/-'":
        x = x.replace(punct, ' ')
    for punct in '&':
        x = x.replace(punct, f' {punct} ')
    for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        x = x.replace(punct, '')
    return x
df['ptext'] = df['ptext'].apply(clean_text)

In [22]:
vocab = build_vocab(df['ptext'])
oov_bert = check_coverage(vocab, tokenizer.vocab)

Found embeddings for 54.654% of vocab
Found embeddings for  87.827% of all text


In [23]:
oov_bert[:10]

[('\x89', 1151),
 ('û', 639),
 ('ûªs', 125),
 ('lol', 99),
 ('wildfire', 98),
 ('mh370', 95),
 ('reddit', 83),
 ('legionnaires', 77),
 ('thunderstorm', 69),
 ('ûò', 69)]

### Customized data cleaning

Reference:
- [Basic EDA,Cleaning and GloVe](https://www.kaggle.com/shahules/basic-eda-cleaning-and-glove#Data-Cleaning)

In [24]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'',text)
df['ptext'] = df['ptext'].apply(lambda x : remove_URL(x))

In [25]:
def remove_html(text):
    html=re.compile(r'<.*?>')
    return html.sub(r'',text)
df['ptext']=df['ptext'].apply(lambda x : remove_html(x))

In [26]:
# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
df['ptext']=df['ptext'].apply(lambda x: remove_emoji(x))

In [27]:
def remove_punct(text):
    table=str.maketrans('','',string.punctuation)
    return text.translate(table)
df['ptext']=df['ptext'].apply(lambda x : remove_punct(x))

In [None]:
# !pip install pyspellchecker

In [None]:
from spellchecker import SpellChecker

spell = SpellChecker()
def correct_spellings(text):
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)
# df['ctext']=df['ctext'].apply(lambda x : correct_spellings(x))

In [28]:
vocab = build_vocab(df['ptext'])

In [29]:
oov_bert = check_coverage(vocab, tokenizer.vocab)

Found embeddings for 54.652% of vocab
Found embeddings for  87.787% of all text


In [30]:
oov_bert[:10]

[('\x89', 1151),
 ('û', 639),
 ('ûªs', 125),
 ('lol', 99),
 ('wildfire', 98),
 ('mh370', 95),
 ('reddit', 83),
 ('legionnaires', 77),
 ('thunderstorm', 69),
 ('ûò', 69)]

### Put it all together

In [48]:
def preprocess(x):
    x = str(x).lower()
    
    x = ' '.join(twt.tokenize(x))
    x = re.sub(r'http\S+', '', x)
    x = re.sub(r'https\S+', '', x)
    
    
    for punct in "/-'":
        x = x.replace(punct, ' ')
    for punct in '&':
        x = x.replace(punct, f' {punct} ')
    for punct in '?!.,"#$%\'()*+-/:;<=>@[\\]^_`{|}~' + '“”’':
        x = x.replace(punct, '')
        
    x = re.sub(r'[^a-z0-9] \n', '', x)
    
    return x

In [49]:
# apply to train and test text
train['text'] = train['text'].apply(preprocess)
test['text'] = test['text'].apply(preprocess)

In [50]:
vocab_train = build_vocab(train['text'])
oov_bert = check_coverage(vocab_train, tokenizer.vocab)

Found embeddings for 58.362% of vocab
Found embeddings for  87.892% of all text


In [51]:
vocab_test = build_vocab(test['text'])
oov_bert = check_coverage(vocab_test, tokenizer.vocab)

Found embeddings for 65.855% of vocab
Found embeddings for  87.673% of all text


### Model

In [40]:
# load BERT Tokenizer
tokenizer = BertTokenizer.from_pretrained(config.model_name)

In [41]:
# define model
class Model(torch.nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        # pre-trained BERT model by HuggingFace
        pretrained_weights = config.model_name
        # from official documentation: https://github.com/huggingface/transformers#quick-tour
        self.base_model = BertModel.from_pretrained(pretrained_weights)
        # add dense layer to the end
        # last hidden layer of BERT has 768 units, output is binary classification
        self.fc1 = torch.nn.Linear(768, 1)
        
    def forward(self, ids, masks):
        # return last layer output
        x = self.base_model(ids, attention_mask=masks)[1]
        # send to fc
        x = self.fc1(x)
        return x

In [42]:
# Torch define device
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [43]:
# load model and put on GPU
model = Model()
model.to(device)

Model(
  (base_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)


In [44]:
def bert_encode(text, tokenizer, max_len=128):
    """ BERT encoder for text 
    
    Return:
    @tokens: input token ids with 0s padding
    @pad_masks: 1 for inputs and 0 for paddings
    """
    # tokenize text using BERT tokenizer
    text = tokenizer.tokenize(text)
    # remove 2 tokens for start and end token
    text = text[:max_len-2]
    # add start and end token
    input_sequence = ["[CLS]"] + text + ["[SEP]"]
    # convert token to token_id
    tokens = tokenizer.convert_tokens_to_ids(input_sequence)
    # the rest of max_len need to be pad
    pad_len = max_len - len(input_sequence)
    # padding to max_len
    tokens += [0] * pad_len
    # masking, 1 for inputs and 0 for paddings
    pad_masks = [1] * len(input_sequence) + [0] * pad_len
    return tokens, pad_masks

## Training

In [52]:
# Use first 6000 for training, rest for validation
train_text = train.text[:config.sample_size]
val_text = train.text[config.sample_size:]

In [53]:
# encode text and get mask
train_tokens = []
train_pad_masks = []
for text in train_text:
    tokens, masks = bert_encode(text, tokenizer)
    train_tokens.append(tokens)
    train_pad_masks.append(masks)

train_tokens = np.array(train_tokens)
train_pad_masks = np.array(train_pad_masks)

In [54]:
# same for validation
val_tokens = []
val_pad_masks = []
for text in val_text:
    tokens, masks = bert_encode(text, tokenizer)
    val_tokens.append(tokens)
    val_pad_masks.append(masks)

val_tokens = np.array(val_tokens)
val_pad_masks = np.array(val_pad_masks)

In [55]:
class Dataset(torch.utils.data.Dataset):
    """build training pytorch dataset """
    def __init__(self, train_tokens, train_pad_masks, targets):
        
        super(Dataset, self).__init__()
        self.train_tokens = train_tokens
        self.train_pad_masks = train_pad_masks
        self.targets = targets
        
    def __getitem__(self, index):
        
        tokens = self.train_tokens[index]
        masks = self.train_pad_masks[index]
        target = self.targets[index]
        
        return (tokens, masks), target
    
    def __len__(self,):
        
        return len(self.train_tokens)

In [56]:
# build training dataset
train_dataset = Dataset(
    train_tokens=train_tokens,
    train_pad_masks=train_pad_masks,
    targets=train.target[:config.sample_size]
)

In [57]:
# build training dataloader
train_dataloader = torch.utils.data.DataLoader(
    dataset=train_dataset, 
    batch_size=config.bs, 
    shuffle=True
)

In [58]:
# define loss and optimizer
criterion = torch.nn.BCEWithLogitsLoss()
# Use Adam Optimizer with learning rate of 0.00001
opt = torch.optim.Adam(model.parameters(), lr=config.max_lr)

In [59]:
# train model
model.train()
y_preds = []

# train 2 epochs
for epoch in range(config.epochs):

    for i, ((tokens, masks), target) in enumerate(train_dataloader):
        y_pred = model(
                    tokens.long().to(device), 
                    masks.long().to(device)
                )
        loss = criterion(y_pred, target[:, None].float().to(device))
        opt.zero_grad()
        loss.backward()
        opt.step()
        # print("Step:", i)
    print('\rEpoch: %d/%d, %f%% loss: %0.2f'% (epoch+1, config.epochs, (i+1)/len(train_dataloader)*100, loss.item()), end='')
    print()

Epoch: 1/2, 100.000000% loss: 0.12
Epoch: 2/2, 100.000000% loss: 0.19


## Validation

In [60]:
# build validation dataset and dataloader
    val_dataset = Dataset(
        train_tokens=val_tokens,
        train_pad_masks=val_pad_masks,
        targets=train.target[config.sample_size:].reset_index(drop=True)
    )
    val_dataloader = torch.utils.data.DataLoader(
        dataset=val_dataset, 
        batch_size=3, 
        shuffle=False
    )

In [61]:
# define accuracy metric
def accuracy(y_actual, y_pred):
    y_ = y_pred > 0
    return np.sum(y_actual == y_).astype('int') / y_actual.shape[0]

In [62]:
# evaluate model on val dataset
model.eval()
avg_acc = 0
for i, ((tokens, masks), target) in enumerate(val_dataloader):

    y_pred = model(
                tokens.long().to(device), 
                masks.long().to(device), 
            )
    loss = criterion(y_pred, 
                     target[:, None].float().to(device))
    acc = accuracy(target.cpu().numpy(), 
                   y_pred.detach().cpu().numpy().squeeze())
    avg_acc += acc
    print('\r%0.2f%% loss: %0.2f, accuracy %0.2f'% ((i+1)/len(val_dataloader)*100, loss.item(), acc), end='')
print('\nAverage accuracy: ', avg_acc / len(val_dataloader))

100.00% loss: 0.02, accuracy 1.00
Average accuracy:  0.8283767038413887


## Testing

In [63]:
# define test dataset
class TestDataset(torch.utils.data.Dataset):

    def __init__(self, test_tokens, test_pad_masks):

        super(TestDataset, self).__init__()
        self.test_tokens = test_tokens
        self.test_pad_masks = test_pad_masks

    def __getitem__(self, index):

        tokens = self.test_tokens[index]
        masks = self.test_pad_masks[index]

        return (tokens, masks)

    def __len__(self,):

        return len(self.test_tokens)

In [64]:
# encode test text and get mask
test_tokens = []
test_pad_masks = []
for text in test.text:
    tokens, masks = bert_encode(text, tokenizer)
    test_tokens.append(tokens)
    test_pad_masks.append(masks)

test_tokens = np.array(test_tokens)
test_pad_masks = np.array(test_pad_masks)

In [65]:
# build test dataset and dataloader
    test_dataset = TestDataset(
        test_tokens=test_tokens,
        test_pad_masks=test_pad_masks
    )
    test_dataloader = torch.utils.data.DataLoader(
        dataset=test_dataset, 
        batch_size=3, 
        shuffle=False
    )

In [66]:
# get result from test dataset
model.eval()
y_preds = []
for (tokens, masks) in test_dataloader:

    y_pred = model(
                tokens.long().to(device), 
                masks.long().to(device), 
            )
    y_preds += y_pred.detach().cpu().numpy().squeeze().tolist()

In [67]:
# get submission dataframe
submission_df = pd.read_csv(str(DATA_DIR / 'sample_submission.csv'))
submission_df['target'] = (np.array(y_preds) > 0).astype('int')
print(submission_df.target.value_counts())
print(submission_df.head())

0    1863
1    1400
Name: target, dtype: int64
   id  target
0   0       1
1   2       1
2   3       1
3   9       1
4  11       1


In [69]:
# output csv
submission_df.to_csv(str(DATA_DIR / 'submission-02.csv'), index=False)

#### Final Score: 0.821