<a href="https://www.kaggle.com/code/syerramilli/disaster-tweets-finetune-distilbert-transformers?scriptVersionId=180959812" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## Introduction

In this notebook, we will use a classification layer which uses a pretrained uncased DistilBERT model as the backbone to process the tweets. We will be using the `Trainer` API from the `transformers` library for finetuning the classifier.

** CHANGELOG **

v1 -> v2:
1. Replaced the default classification layer with custom fully connected layers
2. Froze the weights of the DistilBERT backbone to the pretrained weights - they are not trained further

v2 -> v3:
1. Unfreezing the DistilBERT backbone weights to be trained along with classifier weights
2. Added a preprocessing step prior to model training to remove URLS, numbers, and emoticons
3. Reduced the maximum learning rate in the OneCycleLR policy from 1e-3 to 1e-4

v3 -> v4:
1. Implemented gradual unfreezing of the DistilBERT layers. The weights are initially frozen 

In [1]:
import re
import numpy as np
import pandas as pd
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import OneCycleLR
from transformers import DistilBertForSequenceClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments, ProgressCallback, TrainerCallback
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from datasets import Dataset

from pathlib import Path

2024-06-01 16:19:15.648059: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-01 16:19:15.648165: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-01 16:19:15.732505: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Configuration
BATCH_SIZE = 128
MODEL_NAME = 'distilbert-base-uncased'
NUM_EPOCHS = 15
MAX_LR = 5e-4
WEIGHT_DECAY = 1e-6

## Load the data and check for missing values

In [3]:
data_path = Path('/kaggle/input/nlp-getting-started')
train = pd.read_csv(data_path / 'train.csv')
train.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# check for missing values
train.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

## Cleaning text entries

In [5]:
# remove emoticons
# source: https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

# Preprocess the data
def preprocess_tweet(tweet):
    tweet = tweet.lower()
    # remove urls
    tweet = re.sub(r'http\S+', '', tweet)
    tweet = re.sub(r'www\S+', '', tweet)
    
    # remove numbers
    tweet = re.sub(r'[0-9]+(,[0-9])*(\.[0-9]+)*', '', tweet)
    
    # remove emojis
    tweet = remove_emoji(tweet)
    
    return tweet

train['text'] = train['text'].apply(preprocess_tweet)

## Dataset and tokenizer

In [6]:
# create dataset
train_dataset = Dataset.from_pandas(train[['text', 'target']]).class_encode_column("target")

# split into 80-20 training-test splits
train_dataset = train_dataset.train_test_split(test_size = 0.2,seed = 1)

train_dataset

Stringifying the column:   0%|          | 0/7613 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/7613 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'target'],
        num_rows: 6090
    })
    test: Dataset({
        features: ['text', 'target'],
        num_rows: 1523
    })
})

In [7]:
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=512)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
# Create a Dataset object from the DataFrame
tokenized_dataset = train_dataset.map(tokenize, batched=True, batch_size=BATCH_SIZE)

Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

In [9]:
# Renaming the target column
tokenized_dataset = tokenized_dataset.rename_column("target", "labels")
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 6090
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1523
    })
})

## Load the pretrained model

In [10]:
class CustomDistilBertForSequenceClassification(DistilBertForSequenceClassification):
    def __init__(self, config):
        super().__init__(config)
        self.pre_classifier=None
        # classification block - 2 ff layers followed by an output layer
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(config.dim, 128),
            torch.nn.BatchNorm1d(128),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(128, 128),
            torch.nn.BatchNorm1d(128),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(128, config.num_labels)
        )
    
    def forward(
        self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, 
        labels=None, output_attentions=None, output_hidden_states=None, return_dict=None
    ):
        outputs = self.distilbert(
            input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, 
            inputs_embeds=inputs_embeds, output_attentions=output_attentions, 
            output_hidden_states=output_hidden_states, 
            return_dict=return_dict
        )
        hidden_state = outputs[0]  # (bs, seq_len, dim)
        pooled_output = hidden_state[:, 0]  # (bs, dim)
        logits = self.classifier(pooled_output) # (bs, num_labels)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = torch.nn.MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1))
            elif self.config.problem_type == "single_label_classification":
                loss_fct = torch.nn.CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = torch.nn.BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return torch.nn.utils.rnn.PackedSequence(
            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
        )
    
    def freeze_base_model(self):
        for param in self.distilbert.parameters():
            param.requires_grad = False

    def unfreeze_last_layers(self, num_layers):
        for layer in self.distilbert.transformer.layer[-num_layers:]:
            for param in layer.parameters():
                param.requires_grad = True

In [11]:
model = CustomDistilBertForSequenceClassification.from_pretrained(MODEL_NAME)
model.freeze_base_model()  # Freeze all layers initially

# print model architecture
print(model)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of CustomDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.0.bias', 'classifier.0.weight', 'classifier.1.bias', 'classifier.1.num_batches_tracked', 'classifier.1.running_mean', 'classifier.1.running_var', 'classifier.1.weight', 'classifier.4.bias', 'classifier.4.weight', 'classifier.5.bias', 'classifier.5.num_batches_tracked', 'classifier.5.running_mean', 'classifier.5.running_var', 'classifier.5.weight', 'classifier.8.bias', 'classifier.8.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CustomDistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=Fa

## Training

In [12]:
class GradualUnfreezingCallback(TrainerCallback):
    def __init__(self, model, intervals):
        self.model = model
        self.intervals = intervals
        self.current_interval = 0

    def on_epoch_end(self, args, state, control, **kwargs):
        if state.epoch in self.intervals:
            num_layers = self.intervals[state.epoch]
            self.model.unfreeze_last_layers(num_layers)
            self.current_interval += 1
            print(f"Unfreezing last {num_layers} layers at epoch {int(state.epoch)}")

In [13]:
def compute_metrics(pred):
    labels = pred.label_ids
    logits = pred.predictions
    preds = logits.argmax(-1) # might not be the best threshold
    
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    auc = roc_auc_score(labels, logits[:, -1])
    
    return {"accuracy": accuracy, "f1": f1, "auc": auc}

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    report_to="none", # disable all integrations,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy="epoch", # to log training loss
    load_best_model_at_end=True, # selects the best performing model
    metric_for_best_model='f1',
    save_total_limit=1 # saves only the best model
)

In [14]:
# define the optimizer and scheduler
optimizer = AdamW(model.parameters(), weight_decay= WEIGHT_DECAY)
lr_scheduler = OneCycleLR(
    optimizer, max_lr = MAX_LR,
    epochs = NUM_EPOCHS,
    steps_per_epoch=int(np.ceil(len(tokenized_dataset['train']) / BATCH_SIZE)),
    anneal_strategy='cos',
    pct_start=0.3
)

# Define the intervals for unfreezing layers
unfreeze_intervals = {5: 2, 9: 4, 12: 6}
unfreezing_callback = GradualUnfreezingCallback(model, unfreeze_intervals)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, lr_scheduler),
    callbacks=[unfreezing_callback]
)

# Fine-tune the model
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Auc
1,0.652,0.603012,0.772817,0.705782,0.830218
2,0.5191,0.44526,0.797111,0.741423,0.87303
3,0.4457,0.422219,0.811556,0.77095,0.88318
4,0.4166,0.445277,0.800394,0.775148,0.882149
5,0.4044,0.410543,0.819435,0.752475,0.875988
6,0.4225,0.386582,0.832567,0.790123,0.898424
7,0.3607,0.400438,0.834537,0.7984,0.892643
8,0.2906,0.42225,0.825345,0.77069,0.882279
9,0.2108,0.477073,0.818122,0.780681,0.87758
10,0.2177,0.569598,0.810243,0.738934,0.879695


Unfreezing last 2 layers at epoch 5
Unfreezing last 4 layers at epoch 9
Unfreezing last 6 layers at epoch 12


TrainOutput(global_step=720, training_loss=0.2896939320696725, metrics={'train_runtime': 240.9899, 'train_samples_per_second': 379.062, 'train_steps_per_second': 2.988, 'total_flos': 1923816593173656.0, 'train_loss': 0.2896939320696725, 'epoch': 15.0})

## Test predictions and submission

In [15]:
test = pd.read_csv(data_path / 'test.csv')
test['text'] = test['text'].apply(preprocess_tweet)
test_dataset_tokenized = (
    Dataset
    .from_pandas(test[['text']])
    .map(tokenize, batched=True, batch_size=BATCH_SIZE)
)

# returns logits
test_predictions = trainer.predict(test_dataset_tokenized)

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

In [16]:
submission = pd.DataFrame({
    'id': test['id'],
    'target':  test_predictions.predictions.argmax(-1)
})

submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [17]:
submission.to_csv('submission.csv', index=False)