In [42]:
import pandas as pd
import preprocessor as prep_t 
import spacy
import numpy as np
import string
import re
import time
import datetime
import random
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup

# Goal

**This notebooks aims to model a sequence classifier for disaster tweets https://www.kaggle.com/c/nlp-getting-started based on BERT. The main idea of this notebook's approach is to clean the tweets and use only tweets corpus to fine-tune BERT with Huggingface's transformer.**

# Data set

**Load**

In [43]:
df = pd.read_csv('../data/train.csv')
df.shape

(7613, 5)

# Preprocesing

## Mislabeled data

There are some mislabeled data. We re-label them manually to avoid the introduction of bias in the training process

In [8]:
df_mislabeled = df.groupby(['text']).nunique().sort_values(by='target', ascending=False)
df_mislabeled = df_mislabeled[df_mislabeled['target'] > 1]['target']
df_mislabeled.index.tolist()[:2]

['like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit',
 'Hellfire! We don\x89Ûªt even want to think about it or mention it so let\x89Ûªs not do anything that leads to it #islam!']

Re-labelling

In [9]:
df['target_relabeled'] = df['target'].copy() 
df.loc[df['text'] == 'like for the music video I want some real action shit like burning buildings and police chases not some weak ben winston shit', 'target_relabeled'] = 0
df.loc[df['text'] == 'Hellfire is surrounded by desires so be careful and donÛªt let your desires control you! #Afterlife', 'target_relabeled'] = 0
df.loc[df['text'] == 'To fight bioterrorism sir.', 'target_relabeled'] = 0
df.loc[df['text'] == '.POTUS #StrategicPatience is a strategy for #Genocide; refugees; IDP Internally displaced people; horror; etc. https://t.co/rqWuoy1fm4', 'target_relabeled'] = 1
df.loc[df['text'] == 'CLEARED:incident with injury:I-495  inner loop Exit 31 - MD 97/Georgia Ave Silver Spring', 'target_relabeled'] = 1
df.loc[df['text'] == '#foodscare #offers2go #NestleIndia slips into loss after #Magginoodle #ban unsafe and hazardous for #humanconsumption', 'target_relabeled'] = 0
df.loc[df['text'] == 'In #islam saving a person is equal in reward to saving all humans! Islam is the opposite of terrorism!', 'target_relabeled'] = 0
df.loc[df['text'] == 'Who is bringing the tornadoes and floods. Who is bringing the climate change. God is after America He is plaguing her\n \n#FARRAKHAN #QUOTE', 'target_relabeled'] = 1
df.loc[df['text'] == 'RT NotExplained: The only known image of infamous hijacker D.B. Cooper. http://t.co/JlzK2HdeTG', 'target_relabeled'] = 1
df.loc[df['text'] == "Mmmmmm I'm burning.... I'm burning buildings I'm building.... Oooooohhhh oooh ooh...", 'target_relabeled'] = 0
df.loc[df['text'] == "wowo--=== 12000 Nigerian refugees repatriated from Cameroon", 'target_relabeled'] = 0
df.loc[df['text'] == "He came to a land which was engulfed in tribal war and turned it into a land of peace i.e. Madinah. #ProphetMuhammad #islam", 'target_relabeled'] = 0
df.loc[df['text'] == "Hellfire! We donÛªt even want to think about it or mention it so letÛªs not do anything that leads to it #islam!", 'target_relabeled'] = 0
df.loc[df['text'] == "The Prophet (peace be upon him) said 'Save yourself from Hellfire even if it is by giving half a date in charity.'", 'target_relabeled'] = 0
df.loc[df['text'] == "Caution: breathing may be hazardous to your health.", 'target_relabeled'] = 1
df.loc[df['text'] == "I Pledge Allegiance To The P.O.P.E. And The Burning Buildings of Epic City. ??????", 'target_relabeled'] = 0
df.loc[df['text'] == "#Allah describes piling up #wealth thinking it would last #forever as the description of the people of #Hellfire in Surah Humaza. #Reflect", 'target_relabeled'] = 0
df.loc[df['text'] == "that horrible sinking feeling when youÛªve been at home on your phone for a while and you realise its been on 3G this whole time", 'target_relabeled'] = 0

In [10]:
df = df.drop(columns=['target'])
df = df.rename(columns={'target_relabeled':'target'})

## Simple counts

**Missing values**

In [12]:
df.isnull().sum()/df.shape[0]

id          0.000000
keyword     0.008013
location    0.332720
text        0.000000
target      0.000000
dtype: float64

**Balance between classes**

In [14]:
df.target.value_counts()

0    4354
1    3259
Name: target, dtype: int64

## Missing values

In [22]:
df = df.fillna('')

## Clean text

**Remove special characters \x89** 

In [23]:
df['cleaned_text']=df['text'].apply(lambda x:re.sub(r"(.*[a-zA-Z]?)\x89[^\W]*([a-zA-Z]?.*)", r"\1, \2", x))
df_test['cleaned_text']=df_test['text'].apply(lambda x:re.sub(r"(.*[a-zA-Z]?)\x89[^\W]*([a-zA-Z]?.*)", r"\1, \2", x))

**Clean tweets**

In [24]:
prep_t.set_options(prep_t.OPT.URL, prep_t.OPT.EMOJI, prep_t.OPT.MENTION)

In [25]:
df['cleaned_text'] = df['cleaned_text'].map(prep_t.clean)
df_test['cleaned_text'] = df_test['cleaned_text'].map(prep_t.clean)

**Remove consecutive dots (repeating more than twice)**

In [26]:
consequitivedots = re.compile(r'\.{2,}')
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: consequitivedots.sub(' ... ', x))

**Remove non ascii characters**

In [27]:
def remove_non_ascii(text):
    """
        Remove non-ASCII characters 
    """
    return re.sub(r'[^\x00-\x7f]',r'', text)

In [28]:
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: remove_non_ascii(x))
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: x.replace('#', ' ').replace('@', " "))

df['cleaned_text'] = df['cleaned_text'].apply(lambda x: ' '.join(x.split()))

Check cleaned text

In [74]:
for i in df['cleaned_text'].sample(10):
    print(i)

Refugio oil spill may have been costlier bigger than projected
Parents of Colorado theater shooting victim fear copycat massacre Antioch
I liked a video EXTREME PAINT TWISTER
Lying Clinton sinking! Donald Trump singing: Let's Make America Great Again!
Jacksonville family bands together as memorial is planned for toddler who ... - Florida,
Reddit Will Now Quarantine, onlinecommunities reddit amageddon freespeech Business
Sharp rise in women children casualties in Afghan war UN says
thinking of the time that my friend bailed the nite b4 a dead show ... went alone &amp; had a GREAT time. All alone and free to dance. Front row
USGS EQ: M 1.2 - 23km S of Twentynine Palms California: Time2015-08-05 23:54:09 UTC2015-08-05 16: ... EarthQuake
Headed to the massacre Bodies arriving everyday What were those shells you heard Picking the bones up along the way


Notes: We do not transform text in lowercases because some upper cases could give information. Also, we do not clean "too much" the text to preserve the brut information which BERT deals with well. 

# Classifier

## Load BERT

Load uncased pre-trained BERT model (12-layers)

In [94]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    "bert-base-cased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = True, # Whether the model returns all hidden-states.
)

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

If GPU

In [95]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

BERT strcture check: in the package of 

In [98]:
params = list(model.named_parameters())
print('The BERT model has {:} different named parameters.\n'.format(len(params)))
print('==== Embedding Layer ====\n')
for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== First Transformer ====\n')
for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== Output Layer ====\n')
for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (28996, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

## Setting optimizer and parameters

**Some useful functions**

In [99]:
class TweetsDataset(Dataset):
    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
    def __len__(self):
        return len(self.reviews)
    def __getitem__(self, item):
        review = str(self.reviews[item])
        target = self.targets[item]
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
        'review_text': review,
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'targets': torch.tensor(target, dtype=torch.long)
        }

def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = TweetsDataset(
        reviews=df.cleaned_text.to_numpy(),
        targets=df.target.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
      )
    return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=4
    )

In [103]:
def format_time(elapsed):
    """
    Takes a time in seconds and returns a string hh:mm:ss
    """
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [104]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

**We use AdamW optimizer to fine-tune BERT**

In [101]:
EPOCHS = 5

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

BATCH_SIZE = 16
MAX_LEN = 84 ## chosen with max length of texts we observe

train_data_loader = create_data_loader(df, tokenizer, MAX_LEN, BATCH_SIZE)

## We could also split our df into 2 train & validation sets to measure performance on validation set
#train_data_loader = create_data_loader(df_train_bert, tokenizer, MAX_LEN, BATCH_SIZE)
#val_data_loader = create_data_loader(df_val_bert, tokenizer, MAX_LEN, BATCH_SIZE)

## Train

In [108]:
%%time
loss_values = []
for epoch_i in range(0, EPOCHS):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, EPOCHS))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    # Reset the total loss for this epoch.
    total_loss = 0
    
    model.train()
    for step, batch in enumerate(train_data_loader):
        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_data_loader), elapsed))
        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        b_input_ids = batch['input_ids'].to(device) #input ids
        b_input_mask = batch['attention_mask'].to(device) #attention masks
        b_labels = batch['targets'].to(device) #labels
        
        model.zero_grad()        
 
        outputs = model(b_input_ids,
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        
        loss = outputs[0]
        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_loss += loss.item()
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()
        # Update the learning rate.
        scheduler.step()
    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_data_loader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
    
print("")
print("Training complete!")


Training...
  Batch    40  of    476.    Elapsed: 0:04:47.
  Batch    80  of    476.    Elapsed: 0:09:37.
  Batch   120  of    476.    Elapsed: 0:14:26.
  Batch   160  of    476.    Elapsed: 0:19:17.
  Batch   200  of    476.    Elapsed: 0:24:08.
  Batch   240  of    476.    Elapsed: 0:28:58.
  Batch   280  of    476.    Elapsed: 0:34:07.
  Batch   320  of    476.    Elapsed: 0:40:01.
  Batch   360  of    476.    Elapsed: 0:46:43.
  Batch   400  of    476.    Elapsed: 0:52:11.
  Batch   440  of    476.    Elapsed: 0:56:59.

  Average training loss: 0.70
  Training epcoh took: 1:01:17

Training...
  Batch    40  of    476.    Elapsed: 0:06:26.
  Batch    80  of    476.    Elapsed: 0:12:21.
  Batch   120  of    476.    Elapsed: 0:17:34.
  Batch   160  of    476.    Elapsed: 0:22:34.
  Batch   200  of    476.    Elapsed: 0:27:39.
  Batch   240  of    476.    Elapsed: 0:32:52.
  Batch   280  of    476.    Elapsed: 0:38:04.
  Batch   320  of    476.    Elapsed: 0:43:48.
  Batch   360  of  

**Save fine-tuned BERT model**

In [120]:
#model.save_pretrained('./my_fine_tuned_model/')

# Predictions

In [None]:
df_test = pd.read_csv('../data/test.csv')

df_test = df_test.fillna('')

df_test['cleaned_text']=df_test['text'].apply(lambda x:re.sub(r"(.*[a-zA-Z]?)\x89[^\W]*([a-zA-Z]?.*)", r"\1, \2", x))
df_test['cleaned_text'] = df_test['cleaned_text'].map(prep_t.clean)


df_test['cleaned_text'] = df_test['cleaned_text'].apply(lambda x: consequitivedots.sub(' ... ', x))
df_test['cleaned_text'] = df_test['cleaned_text'].apply(lambda x: remove_non_ascii(x))
df_test['cleaned_text'] = df_test['cleaned_text'].apply(lambda x: x.replace('#', ' ').replace('@', " "))
df_test['cleaned_text'] = df_test['cleaned_text'].apply(lambda x: ' '.join(x.split()))

In [114]:
%%time
predictions = []
for review_text in df_test.cleaned_text.values:
    encoded_review = tokenizer.encode_plus(
      review_text,
      max_length=MAX_LEN,
      add_special_tokens=True,
      return_token_type_ids=False,
      pad_to_max_length=True,
      return_attention_mask=True,
      return_tensors='pt',
    )
    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)
    output = model(input_ids, attention_mask)
    _, prediction = torch.max(output[0], dim=1)
    predictions.append(prediction[0].tolist())

CPU times: user 17min 19s, sys: 1min 26s, total: 18min 45s
Wall time: 10min 43s


In [41]:
#output[['id', 'target']].to_csv('submissions.csv', sep=",", index = False)