In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from tokenizers import decoders, models, normalizers, pre_tokenizers, processors, trainers, Tokenizer
from transformers import BertTokenizerFast

from pathlib import Path
plt.style.use('ggplot')

In [2]:
## CONFIG
VOCAB_SIZE = 5000
MIN_FREQUENCY = 10
MAX_SEQ_LEN = 50
BATCH_SIZE = 256
MAX_LR = 5e-4
NUM_EPOCHS = 20
NUM_LSTM_LAYERS = 3

## Load the data and preprocessing

In [3]:
data_path = Path('/kaggle/input/nlp-getting-started')
train = pd.read_csv(data_path / 'train.csv')
train.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Preprocess the data
def preprocess_tweet(tweet):
    tweet = tweet.lower()
    # remove urls
    tweet = re.sub(r'http\S+', '', tweet)
    tweet = re.sub(r'www\S+', '', tweet)
    
    # remove numbers
    tweet = re.sub(r'[0-9]+(,[0-9])*(\.[0-9]+)*', '', tweet)
    return tweet

train['text'] = train['text'].apply(preprocess_tweet)

## WordPiece Tokenizer

In [5]:
# tokenizer with empty WordPiece model
# unk_token - token for unknown words; using BERT convention
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

In [6]:
# preprocessing steps
# normalizer: NFD normalization, lower case, strip accents
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)

# pre-tokenizer: split text using whitespace and punctuation
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

In [7]:
special_tokens = ["[UNK]", "[PAD]"]
trainer = trainers.WordPieceTrainer(
    vocab_size=VOCAB_SIZE,
    min_frequency=MIN_FREQUENCY,
    special_tokens=special_tokens,
    show_progress=True
)

tokenizer.train_from_iterator(train['text'].tolist(), trainer=trainer)

# Optional: specifiy decoder
tokenizer.decoder = decoders.WordPiece(prefix="##")






In [8]:
# Save the tokenizer
tokenizer.save("wordpiece_tokenizer")

## Training and Validation Datasets

In [9]:
# pad_to
pad_token_id = tokenizer.token_to_id("[PAD]")


# Function to pad and truncate sequences
def pad_and_truncate(sequence, max_length, pad_token_id):
    if len(sequence) > max_length:
        return sequence[:max_length]
    return sequence + [pad_token_id] * (max_length - len(sequence))


class TweetDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, pad_token_id):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.pad_token_id = pad_token_id

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokenized_text = self.tokenizer.encode(text).ids
        return torch.tensor(tokenized_text), torch.tensor(label)

# Custom collate function for dynamic padding
def collate_fn(batch):
    texts, labels = zip(*batch)
    
    # Compute max length in the batch
    max_length = max(MAX_SEQ_LEN, max(len(text) for text in texts))
    
    # Pad sequences to the max length
    padded_texts = [pad_and_truncate(text.tolist(), max_length, pad_token_id) for text in texts]
    
    # Convert to tensors
    padded_texts = torch.tensor(padded_texts)
    labels = torch.tensor(labels)
    
    return padded_texts, labels

In [10]:
# split into trainining and validation datasets
train_tweets, val_tweets, train_labels, val_labels = train_test_split(
    train['text'].tolist(), train['target'].tolist(), test_size=0.2, random_state=42
)

# Create datasets
train_dataset = TweetDataset(train_tweets, train_labels, tokenizer, pad_token_id)
val_dataset = TweetDataset(val_tweets, val_labels, tokenizer, pad_token_id)

# Create dataloaders with custom collate function
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

## LSTM classifier

In [11]:
class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_token_id):
        super(BiLSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_token_id)
        self.lstm = nn.LSTM(
            embedding_dim, hidden_dim, 
            num_layers=NUM_LSTM_LAYERS,
            bidirectional=True, 
            batch_first=True)
        
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(0.3)
        self.pad_token_id = pad_token_id

    def forward(self, text):
        embedded = self.embedding(text)
        lstm_out, _ = self.lstm(embedded)
        
        # Create mask for padding tokens
        mask = (text != self.pad_token_id).unsqueeze(2).type(torch.float32)
        
        # Apply mask to LSTM outputs
        masked_lstm_out = lstm_out * mask
        
        # Sum the outputs and divide by the number of valid (non-pad) tokens
        pooled = masked_lstm_out.sum(dim=1) / mask.sum(dim=1)
        
        output = self.dropout(pooled)
        return self.fc(output)


## Training the model

In [12]:
from tqdm import tqdm

def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs=5):
    model.train()
    for epoch in range(epochs):
        epoch_loss = 0
        model.train()
        
        # Create a progress bar for training
        with tqdm(total=len(train_loader), desc=f'Epoch {epoch + 1}/{epochs}', unit='batch') as pbar:
            for texts, labels in train_loader:
                texts, labels = texts.to(device), labels.to(device).float()
                optimizer.zero_grad()
                predictions = model(texts).squeeze(1)
                loss = criterion(predictions, labels)
                loss.backward()
                optimizer.step()
                scheduler.step()  # Update the learning rate
                
                epoch_loss += loss.item()
                pbar.set_postfix({'Train Loss': epoch_loss / (pbar.n + 1)})
                pbar.update()

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            with tqdm(total=len(val_loader), desc='Validating', unit='batch') as pbar:
                for texts, labels in val_loader:
                    texts, labels = texts.to(device), labels.to(device).float()
                    predictions = model(texts).squeeze(1)
                    loss = criterion(predictions, labels)
                    val_loss += loss.item()
                    pbar.set_postfix({'Validation Loss': val_loss / (pbar.n + 1)})
                    pbar.update()
        
        # Log epoch losses
        #print(f'Epoch {epoch + 1}, Train Loss: {epoch_loss / len(train_loader)}, Validation Loss: {val_loss / len(val_loader)}')

In [13]:
# Model parameters
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

embedding_dim = 100
hidden_dim = 128
output_dim = 1
vocab_size = tokenizer.get_vocab_size()

# Set up the model, optimizer, and criterion
model = BiLSTMClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, pad_token_id).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-5)  # Set initial LR for the optimizer
criterion = nn.BCEWithLogitsLoss()

# Setup OneCycleLR
total_steps = len(train_loader) * NUM_EPOCHS # total training steps = num_epochs * batches_per_epoch
scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=MAX_LR, total_steps=total_steps)

# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs=NUM_EPOCHS)

Epoch 1/20: 100%|██████████| 24/24 [00:02<00:00,  8.56batch/s, Train Loss=0.695]
Validating: 100%|██████████| 6/6 [00:00<00:00, 18.62batch/s, Validation Loss=0.692]
Epoch 2/20: 100%|██████████| 24/24 [00:01<00:00, 14.04batch/s, Train Loss=0.688]
Validating: 100%|██████████| 6/6 [00:00<00:00, 21.45batch/s, Validation Loss=0.681]
Epoch 3/20: 100%|██████████| 24/24 [00:01<00:00, 14.09batch/s, Train Loss=0.672]
Validating: 100%|██████████| 6/6 [00:00<00:00, 20.15batch/s, Validation Loss=0.647]
Epoch 4/20: 100%|██████████| 24/24 [00:01<00:00, 13.98batch/s, Train Loss=0.614]
Validating: 100%|██████████| 6/6 [00:00<00:00, 20.94batch/s, Validation Loss=0.596]
Epoch 5/20: 100%|██████████| 24/24 [00:01<00:00, 14.04batch/s, Train Loss=0.547]
Validating: 100%|██████████| 6/6 [00:00<00:00, 20.66batch/s, Validation Loss=0.56]
Epoch 6/20: 100%|██████████| 24/24 [00:01<00:00, 13.94batch/s, Train Loss=0.498]
Validating: 100%|██████████| 6/6 [00:00<00:00, 21.51batch/s, Validation Loss=0.533]
Epoch 7/20:

In [14]:
def evaluate_model(model, val_loader):
    model.eval()
    all_preds = []
    all_labels = []
    all_probs = []
    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.to(device), labels.to(device).float()
            predictions = model(texts).squeeze(1)
            probs = torch.sigmoid(predictions)
            preds = torch.round(probs)
            all_probs.extend(probs.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    roc_auc = roc_auc_score(all_labels, all_probs)
    return accuracy, f1, roc_auc

# Example of evaluating the model
accuracy, f1, roc_auc = evaluate_model(model, val_loader)
print(f'Accuracy: {accuracy:.4f}, F1: {f1:.4f}, ROC AUC: {roc_auc:.4f}')

Accuracy: 0.7492, F1: 0.7084, ROC AUC: 0.8153


## Test predictions and submission

In [15]:
class TestDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length, pad_token_id):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pad_token_id = pad_token_id

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tokenized_text = self.tokenizer.encode(text).ids
        padded_text = pad_and_truncate(tokenized_text, self.max_length, self.pad_token_id)
        return torch.tensor(padded_text)

test = pd.read_csv(data_path / 'test.csv')
test['text'] = test['text'].apply(preprocess_tweet)

# Create the test dataset
test_dataset = TestDataset(test['text'].tolist(), tokenizer, MAX_SEQ_LEN, pad_token_id)

# Create the test DataLoader
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

In [16]:
def generate_predictions(model, test_loader):
    model.eval()
    all_probs = []
    with torch.no_grad():
        for texts in tqdm(test_loader):
            texts = texts.to(device)
            predictions = model(texts).squeeze(1)
            probs = torch.sigmoid(predictions)
            all_probs.extend(probs.cpu().numpy())
    return all_probs

# Generate predictions on the test set
test_predictions = generate_predictions(model, test_loader)


100%|██████████| 4/4 [00:00<00:00,  7.17it/s]


In [17]:
submission = pd.DataFrame({'id':test['id'],'target':test_predictions})
submission['target'] = submission['target'].round().astype(int)
submission.to_csv('submission.csv',index=False)
submission.head(5)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
