##### https://www.kaggle.com/competitions/nlp-getting-started
##### Addison Howard, devrishi, Phil Culliton, Yufeng Guo. (2019). Natural Language Processing with Disaster Tweets


In [1]:
import numpy as np 
import pandas as pd
import torch
from torch.utils.data import TensorDataset

In [2]:
### Dataset: target == 1 marks a disaster tweet, target == 0 a non-disaster tweet
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/train.csv", "./data/test.csv")
)

In [3]:
# Shuffle every row once before the train/validation split. The fixed seed keeps
# the shuffle (and therefore the split and all downstream metrics) reproducible
# across kernel restarts.
tmp = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
print(len(df_train))

7613


In [4]:
# Train/validation split of the shuffled training data.
# NOTE: `df_train` is rebound to the first 6000 shuffled rows here, so re-running
# the shuffle cell after this one would sample from that subset instead of the
# full CSV.
df_train = tmp.iloc[:6000]
df_val = tmp.iloc[6000:]
df_mini = tmp.iloc[:100]   # first 100 rows of the shuffled frame (overlaps df_train)
print(df_train.shape, df_val.shape, sep="\n")


(6000, 5)
(1613, 5)


##### https://huggingface.co/docs/transformers/model_doc/bert
##### https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertForSequenceClassification.forward.example

### Fine-tune BERT using `transformers.Trainer`


In [5]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader

# Load the pretrained tokenizer and a BERT sequence classifier with a 2-class head
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=False,
    output_attentions=False,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Custom Dataset class for BERT & PyTorch
# Each item is a dict with: input_ids, attention_mask, label
class TextDataset(Dataset):
    """Map-style dataset that tokenizes one dataframe row per item.

    Args:
        dataframe: Frame with a ``'text'`` column (input to the tokenizer) and a
            ``'target'`` column (integer class label).
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors='pt')``
            whose result is indexable by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every sequence is padded/truncated to. Defaults to
            128, the value that used to be hard-coded.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` (positional) and return the tensors for one sample.

        Returns:
            dict with 1-D ``'input_ids'`` and ``'attention_mask'`` tensors
            (flattened tokenizer output) and a scalar ``torch.long`` ``'label'``.
        """
        # Fetch the row once; each `.iloc[idx]` call builds a new Series.
        row = self.data.iloc[idx]
        encoding = self.tokenizer(
            row['text'],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            # return_tensors='pt' adds a leading batch axis; flatten drops it
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(int(row['target']), dtype=torch.long)
        }

In [7]:
# Create Dataset and DataLoader objects for each split
train_dataset = TextDataset(df_train, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# The validation loader is only used for evaluation, so it keeps the dataset
# order: shuffling adds nothing there and makes per-batch outputs irreproducible.
val_dataset = TextDataset(df_val, tokenizer)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)

mini_dataset = TextDataset(df_mini, tokenizer)
mini_dataloader = DataLoader(mini_dataset, batch_size=32, shuffle=True)

In [8]:
# Inspect one encoded validation example (dict of input_ids, attention_mask, label)
first_val_item = val_dataset[0]
print(first_val_item)

{'input_ids': tensor([  101,  9619,  2344,  2006, 27686, 14128,  2012, 20116,  2595,  4315,
        12502,  3672,  2609,  1011, 20021,  2739, 16074,  8299,  1024,  1013,
         1013,  1056,  1012,  2522,  1013, 21025,  2229,  5358,  2290, 26677,
         2099,  8299,  1024,  1013,  1013,  1056,  1012,  2522,  1013, 13221,
        10343,  2290,  2243,  2546,  2487,  4143,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

In [9]:
# Hyper-parameters and bookkeeping options handed to the Trainer
training_args = TrainingArguments(
    # --- directories and logging ---
    output_dir='./data',                # output directory
    logging_dir='./data',               # directory for logs
    logging_steps=1,
    # --- optimisation schedule ---
    num_train_epochs=1,                 # number of training epochs
    warmup_steps=10,                    # number of warmup steps
    weight_decay=0.001,                 # strength of weight decay
    # --- batching ---
    per_device_train_batch_size=32,     # batch size for training
    per_device_eval_batch_size=32,      # batch size for evaluation
    # --- evaluation ---
    eval_strategy="epoch",              # evaluate every epoch
)


In [10]:
# Wire the model, the training arguments and both datasets into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning; as the cell's last expression, the return value is displayed
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.221,0.367725


TrainOutput(global_step=188, training_loss=0.4475020633891542, metrics={'train_runtime': 1202.8555, 'train_samples_per_second': 4.988, 'train_steps_per_second': 0.156, 'total_flos': 394666583040000.0, 'train_loss': 0.4475020633891542, 'epoch': 1.0})

In [11]:
### Save the fine-tuned model and tokenizer
# NOTE(review): both calls below are commented out, so this cell does nothing when
# run. The recorded output listing files under ./data/fine_tuned_bert presumably
# comes from an earlier execution with the calls enabled. Either uncomment them to
# persist the model or clear the stale output.
#model.save_pretrained('./data/fine_tuned_bert')
#tokenizer.save_pretrained('./data/fine_tuned_bert')

('./data/fine_tuned_bert/tokenizer_config.json',
 './data/fine_tuned_bert/special_tokens_map.json',
 './data/fine_tuned_bert/vocab.txt',
 './data/fine_tuned_bert/added_tokens.json')

In [12]:
# Function to calculate accuracy
def compute_accuracy(model, dataloader):
    """Return the fraction of samples in ``dataloader`` that ``model`` classifies correctly.

    Args:
        model: Classifier exposing ``.eval()``, ``.device`` and the call
            ``model(input_ids=..., attention_mask=...)`` whose result has a
            ``.logits`` attribute; the predicted class is the argmax over dim 1.
        dataloader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.

    Returns:
        float: ``correct / total``. Returns ``0.0`` when the dataloader yields no
        samples (previously this raised ``ZeroDivisionError``).

    Note:
        The model is switched to evaluation mode and is left in that mode.
    """
    correct_predictions = 0
    total_predictions = 0

    # Make sure model is in evaluation mode
    model.eval()

    # The target device does not change during the loop, so look it up once
    device = model.device

    # Disable gradient calculation for faster inference
    with torch.no_grad():
        for batch in dataloader:
            # Move the batch to the same device as the model
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Get the model outputs (logits)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Predicted class = index of the largest logit for each sample
            predictions = torch.argmax(outputs.logits, dim=1)

            # Count the number of correct predictions
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.size(0)

    # Guard against an empty dataloader instead of dividing by zero
    if total_predictions == 0:
        return 0.0

    return correct_predictions / total_predictions


In [13]:
### Accuracy on the train and validation samples, computed one split at a time
for split_caption, split_loader in (
    ('Accuracy Train:', train_dataloader),
    ('Accuracy   Val:', val_dataloader),
):
    print(split_caption, compute_accuracy(model, split_loader))

Accuracy Train: 0.8805
Accuracy   Val: 0.8512089274643522
