## Introduction

In this notebook, we will train a LSTM based classification model to predict whether the event described in a tweet is a real disaster or not.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import fasttext
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer
from transformers import BertTokenizerFast

from pathlib import Path
plt.style.use('ggplot')

In [2]:
## CONFIG
MAX_VOCAB_SIZE = 5000
MIN_FREQUENCY = 10
MAX_SEQ_LEN = 50
BATCH_SIZE = 128
MAX_LR = 1e-4
NUM_EPOCHS = 20

NUM_LSTM_LAYERS = 2
EMBEDDING_DIM = 128
HIDDEN_DIM = EMBEDDING_DIM // 2

## Load the data and preprocessing

In [3]:
data_path = Path('/kaggle/input/nlp-getting-started')
train = pd.read_csv(data_path / 'train.csv')
train.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# remove emoticons
# source: https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

# Preprocess the data
def preprocess_tweet(tweet):
    tweet = tweet.lower()
    # remove urls
    tweet = re.sub(r'http\S+', '', tweet)
    
    # remove numbers
    tweet = re.sub(r'[0-9]+(,[0-9])*(\.[0-9]+)*', '', tweet)
    
    # remove emojis
    tweet = remove_emoji(tweet)
    
    return tweet

train['text'] = train['text'].apply(preprocess_tweet)

## WordPiece Tokenizer

In [5]:
# tokenizer with empty WordPiece model
# unk_token - token for unknown words; using BERT convention
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

In [6]:
# preprocessing steps
# normalizer: NFD normalization, lower case, strip accents
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)

# pre-tokenizer: split text using whitespace and punctuation
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

In [7]:
special_tokens = ["[UNK]", "[PAD]"]
trainer = trainers.WordPieceTrainer(
    vocab_size=MAX_VOCAB_SIZE,
    min_frequency=MIN_FREQUENCY,
    special_tokens=special_tokens,
    show_progress=True
)

tokenizer.train_from_iterator(train['text'].tolist(), trainer=trainer)

# Optional: specifiy decoder
tokenizer.decoder = decoders.WordPiece(prefix="##")






In [8]:
# Save the tokenizer
tokenizer.save("wordpiece_tokenizer")

## Pretraining token embeddings using fasttext

In [9]:
# Save tokenized texts to a file
with open("tokenized_texts.txt", "w") as f:
    for text in train['text']:
        f.write(" ".join(tokenizer.encode(text).tokens) + "\n")
        
# Train FastText model 
fasttext_model = fasttext.train_unsupervised('tokenized_texts.txt', model='skipgram', dim=EMBEDDING_DIM)

# Save the model 
fasttext_model.save_model("fasttext_model.bin")

Read 0M words
Number of words:  3265
Number of labels: 0
Progress: 100.0% words/sec/thread:   52180 lr:  0.000000 avg.loss:  2.582424 ETA:   0h 0m 0s


In [10]:
# Create an embedding matrix
vocab_size = tokenizer.get_vocab_size()
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))

# Initialize the embedding matrix with FastText vectors
for word, idx in tokenizer.get_vocab().items():
    embedding_matrix[idx] = fasttext_model.get_word_vector(word)
    
# Convert embedding matrix to a tensor
embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)

## Training and Validation Datasets

In [11]:
# pad_to
pad_token_id = tokenizer.token_to_id("[PAD]")


# Function to pad and truncate sequences
def pad_and_truncate(sequence, max_length, pad_token_id):
    if len(sequence) > max_length:
        return sequence[:max_length]
    return sequence + [pad_token_id] * (max_length - len(sequence))


class TweetDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, pad_token_id):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.pad_token_id = pad_token_id

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokenized_text = self.tokenizer.encode(text).ids
        return torch.tensor(tokenized_text), torch.tensor(label)

# Custom collate function for dynamic padding
def collate_fn(batch):
    texts, labels = zip(*batch)
    
    # Compute max length in the batch
    max_length = max(MAX_SEQ_LEN, max(len(text) for text in texts))
    
    # Pad sequences to the max length
    padded_texts = [pad_and_truncate(text.tolist(), max_length, pad_token_id) for text in texts]
    
    # Convert to tensors
    padded_texts = torch.tensor(padded_texts)
    labels = torch.tensor(labels)
    
    return padded_texts, labels

In [12]:
# split into trainining and validation datasets
train_tweets, val_tweets, train_labels, val_labels = train_test_split(
    train['text'].tolist(), train['target'].tolist(), test_size=0.2, random_state=42
)

# Create datasets
train_dataset = TweetDataset(train_tweets, train_labels, tokenizer, pad_token_id)
val_dataset = TweetDataset(val_tweets, val_labels, tokenizer, pad_token_id)

# Create dataloaders with custom collate function
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

## LSTM classifier

In [13]:
class BiLSTMClassifier(nn.Module):
    def __init__(
        self, vocab_size, embedding_dim, hidden_dim, num_lstm_layers, 
        output_dim, pad_token_id,
        embedding_matrix=None
    ):
        super(BiLSTMClassifier, self).__init__()
        if embedding_matrix is not None:
            self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False, padding_idx=pad_token_id)
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_token_id)
        self.lstm_layers = nn.ModuleList([
            nn.LSTM(
                embedding_dim if i == 0 else hidden_dim * 2,
                hidden_dim,
                num_layers=1,
                bidirectional=True,
                batch_first=True
            ) for i in range(num_lstm_layers)
        ])
        
        
        self.lstm_dropout = nn.Dropout(0.2)
        
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * 2, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, output_dim)
        )
        self.pad_token_id = pad_token_id

    def forward(self, text):
        embedded = self.embedding(text)
        x = embedded
        
        for lstm in self.lstm_layers:
            lstm_out, _ = lstm(x)
            x = x + lstm_out
        
        # Create mask for padding tokens
        mask = (text != self.pad_token_id).unsqueeze(2).type(torch.float32)
        
        # Apply mask to LSTM outputs
        masked_lstm_out = lstm_out * mask
        
        # Sum the outputs and divide by the number of valid (non-pad) tokens
        pooled = masked_lstm_out.sum(dim=1) / mask.sum(dim=1)
        
        output = self.lstm_dropout(pooled)
        return self.fc(output)


## Training the model

In [14]:
from tqdm import tqdm

def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs=5, checkpoint_path='best_model.pth'):
    best_val_loss = float('inf')
    
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        
        # Create a progress bar for the entire epoch
        pbar = tqdm(total=len(train_loader), desc=f'Epoch {epoch + 1}/{epochs}', unit='batch')

        for texts, labels in train_loader:
            texts, labels = texts.to(device), labels.to(device).float()
            optimizer.zero_grad()
            predictions = model(texts).squeeze(1)
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()  # Update the learning rate

            epoch_loss += loss.item()
            pbar.set_postfix({'Train Loss': epoch_loss / (pbar.n + 1)})
            pbar.update()

        pbar.close()

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for texts, labels in val_loader:
                texts, labels = texts.to(device), labels.to(device).float()
                predictions = model(texts).squeeze(1)
                loss = criterion(predictions, labels)
                val_loss += loss.item()

        val_loss /= len(val_loader)
        print(f'Epoch {epoch + 1}, Train Loss: {epoch_loss / len(train_loader):.4f}, Validation Loss: {val_loss:.4f}')

        # Reopen progress bar for next epoch
        tqdm.write(f'Epoch {epoch + 1} completed. Train Loss: {epoch_loss / len(train_loader):.4f}, Validation Loss: {val_loss:.4f}')
        
        # Checkpoint the model if validation loss decreases
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), checkpoint_path)
            print(f'Saved Best Model at Epoch {epoch + 1} with Validation Loss: {val_loss:.4f}')
            
        
    # set to best checkpoint
    model.load_state_dict(torch.load(checkpoint_path))
    print(f'Best validation loss: {best_val_loss:.4f}')


In [15]:
# Model parameters
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vocab_size = tokenizer.get_vocab_size()

# Set up the model, optimizer, and criterion
model = BiLSTMClassifier(
    vocab_size, EMBEDDING_DIM, HIDDEN_DIM, NUM_LSTM_LAYERS, 1,  pad_token_id,
    embedding_matrix=embedding_matrix
).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-5)  # Set initial LR for the optimizer
criterion = nn.BCEWithLogitsLoss()

# Setup OneCycleLR
total_steps = len(train_loader) * NUM_EPOCHS # total training steps = num_epochs * batches_per_epoch
scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=MAX_LR, total_steps=total_steps)

# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs=NUM_EPOCHS)

Epoch 1/20: 100%|██████████| 48/48 [00:01<00:00, 24.10batch/s, Train Loss=0.685]


Epoch 1, Train Loss: 0.6853, Validation Loss: 0.6756
Epoch 1 completed. Train Loss: 0.6853, Validation Loss: 0.6756
Saved Best Model at Epoch 1 with Validation Loss: 0.6756


Epoch 2/20: 100%|██████████| 48/48 [00:01<00:00, 45.93batch/s, Train Loss=0.669]


Epoch 2, Train Loss: 0.6687, Validation Loss: 0.6179
Epoch 2 completed. Train Loss: 0.6687, Validation Loss: 0.6179
Saved Best Model at Epoch 2 with Validation Loss: 0.6179


Epoch 3/20: 100%|██████████| 48/48 [00:01<00:00, 46.78batch/s, Train Loss=0.634]


Epoch 3, Train Loss: 0.6336, Validation Loss: 0.5842
Epoch 3 completed. Train Loss: 0.6336, Validation Loss: 0.5842
Saved Best Model at Epoch 3 with Validation Loss: 0.5842


Epoch 4/20: 100%|██████████| 48/48 [00:01<00:00, 46.68batch/s, Train Loss=0.592]


Epoch 4, Train Loss: 0.5924, Validation Loss: 0.5620
Epoch 4 completed. Train Loss: 0.5924, Validation Loss: 0.5620
Saved Best Model at Epoch 4 with Validation Loss: 0.5620


Epoch 5/20: 100%|██████████| 48/48 [00:01<00:00, 45.15batch/s, Train Loss=0.571]


Epoch 5, Train Loss: 0.5709, Validation Loss: 0.5481
Epoch 5 completed. Train Loss: 0.5709, Validation Loss: 0.5481
Saved Best Model at Epoch 5 with Validation Loss: 0.5481


Epoch 6/20: 100%|██████████| 48/48 [00:01<00:00, 45.94batch/s, Train Loss=0.556]


Epoch 6, Train Loss: 0.5557, Validation Loss: 0.5310
Epoch 6 completed. Train Loss: 0.5557, Validation Loss: 0.5310
Saved Best Model at Epoch 6 with Validation Loss: 0.5310


Epoch 7/20: 100%|██████████| 48/48 [00:01<00:00, 46.33batch/s, Train Loss=0.537]


Epoch 7, Train Loss: 0.5371, Validation Loss: 0.5144
Epoch 7 completed. Train Loss: 0.5371, Validation Loss: 0.5144
Saved Best Model at Epoch 7 with Validation Loss: 0.5144


Epoch 8/20: 100%|██████████| 48/48 [00:01<00:00, 45.46batch/s, Train Loss=0.506]


Epoch 8, Train Loss: 0.5055, Validation Loss: 0.5010
Epoch 8 completed. Train Loss: 0.5055, Validation Loss: 0.5010
Saved Best Model at Epoch 8 with Validation Loss: 0.5010


Epoch 9/20: 100%|██████████| 48/48 [00:01<00:00, 42.28batch/s, Train Loss=0.464]


Epoch 9, Train Loss: 0.4635, Validation Loss: 0.4819
Epoch 9 completed. Train Loss: 0.4635, Validation Loss: 0.4819
Saved Best Model at Epoch 9 with Validation Loss: 0.4819


Epoch 10/20: 100%|██████████| 48/48 [00:01<00:00, 43.10batch/s, Train Loss=0.404]


Epoch 10, Train Loss: 0.4041, Validation Loss: 0.4429
Epoch 10 completed. Train Loss: 0.4041, Validation Loss: 0.4429
Saved Best Model at Epoch 10 with Validation Loss: 0.4429


Epoch 11/20: 100%|██████████| 48/48 [00:01<00:00, 44.83batch/s, Train Loss=0.357]


Epoch 11, Train Loss: 0.3566, Validation Loss: 0.4495
Epoch 11 completed. Train Loss: 0.3566, Validation Loss: 0.4495


Epoch 12/20: 100%|██████████| 48/48 [00:01<00:00, 43.84batch/s, Train Loss=0.319]


Epoch 12, Train Loss: 0.3188, Validation Loss: 0.4427
Epoch 12 completed. Train Loss: 0.3188, Validation Loss: 0.4427
Saved Best Model at Epoch 12 with Validation Loss: 0.4427


Epoch 13/20: 100%|██████████| 48/48 [00:01<00:00, 44.60batch/s, Train Loss=0.291]


Epoch 13, Train Loss: 0.2912, Validation Loss: 0.4665
Epoch 13 completed. Train Loss: 0.2912, Validation Loss: 0.4665


Epoch 14/20: 100%|██████████| 48/48 [00:01<00:00, 42.93batch/s, Train Loss=0.274]


Epoch 14, Train Loss: 0.2739, Validation Loss: 0.4607
Epoch 14 completed. Train Loss: 0.2739, Validation Loss: 0.4607


Epoch 15/20: 100%|██████████| 48/48 [00:01<00:00, 44.78batch/s, Train Loss=0.266]


Epoch 15, Train Loss: 0.2656, Validation Loss: 0.4747
Epoch 15 completed. Train Loss: 0.2656, Validation Loss: 0.4747


Epoch 16/20: 100%|██████████| 48/48 [00:01<00:00, 45.12batch/s, Train Loss=0.252]


Epoch 16, Train Loss: 0.2520, Validation Loss: 0.4767
Epoch 16 completed. Train Loss: 0.2520, Validation Loss: 0.4767


Epoch 17/20: 100%|██████████| 48/48 [00:01<00:00, 44.82batch/s, Train Loss=0.245]


Epoch 17, Train Loss: 0.2454, Validation Loss: 0.4860
Epoch 17 completed. Train Loss: 0.2454, Validation Loss: 0.4860


Epoch 18/20: 100%|██████████| 48/48 [00:01<00:00, 45.58batch/s, Train Loss=0.242]


Epoch 18, Train Loss: 0.2424, Validation Loss: 0.4819
Epoch 18 completed. Train Loss: 0.2424, Validation Loss: 0.4819


Epoch 19/20: 100%|██████████| 48/48 [00:01<00:00, 43.72batch/s, Train Loss=0.244]


Epoch 19, Train Loss: 0.2439, Validation Loss: 0.4836
Epoch 19 completed. Train Loss: 0.2439, Validation Loss: 0.4836


Epoch 20/20: 100%|██████████| 48/48 [00:01<00:00, 44.36batch/s, Train Loss=0.237]


Epoch 20, Train Loss: 0.2367, Validation Loss: 0.4882
Epoch 20 completed. Train Loss: 0.2367, Validation Loss: 0.4882
Best validation loss: 0.4427


In [16]:
def evaluate_model(model, val_loader):
    model.eval()
    all_preds = []
    all_labels = []
    all_probs = []
    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.to(device), labels.to(device).float()
            predictions = model(texts).squeeze(1)
            probs = torch.sigmoid(predictions)
            preds = torch.round(probs)
            all_probs.extend(probs.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    roc_auc = roc_auc_score(all_labels, all_probs)
    return accuracy, f1, roc_auc

# Example of evaluating the model
accuracy, f1, roc_auc = evaluate_model(model, val_loader)
print(f'Accuracy: {accuracy:.4f}, F1: {f1:.4f}, ROC AUC: {roc_auc:.4f}')

Accuracy: 0.8102, F1: 0.7652, ROC AUC: 0.8656


## Test predictions and submission

In [17]:
class TestDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length, pad_token_id):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pad_token_id = pad_token_id

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tokenized_text = self.tokenizer.encode(text).ids
        padded_text = pad_and_truncate(tokenized_text, self.max_length, self.pad_token_id)
        return torch.tensor(padded_text)

test = pd.read_csv(data_path / 'test.csv')
test['text'] = test['text'].apply(preprocess_tweet)

# Create the test dataset
test_dataset = TestDataset(test['text'].tolist(), tokenizer, MAX_SEQ_LEN, pad_token_id)

# Create the test DataLoader
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

In [18]:
def generate_predictions(model, test_loader):
    model.eval()
    all_probs = []
    with torch.no_grad():
        for texts in tqdm(test_loader):
            texts = texts.to(device)
            predictions = model(texts).squeeze(1)
            probs = torch.sigmoid(predictions)
            all_probs.extend(probs.cpu().numpy())
    return all_probs

# Generate predictions on the test set
test_predictions = generate_predictions(model, test_loader)


100%|██████████| 4/4 [00:00<00:00, 10.54it/s]


In [19]:
submission = pd.DataFrame({'id':test['id'],'target':test_predictions})
submission['target'] = submission['target'].round().astype(int)
submission.to_csv('submission.csv',index=False)
submission.head(5)

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1
