In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Problem Description 
Twitter has become an important communication channel in times of emergency.
The ubiquitousness of smartphones enables people to announce an emergency they’re observing in real-time. Because of this, more agencies are interested in programatically monitoring Twitter (i.e. disaster relief organizations and news agencies).

In [1]:
pip install torch transformers

Note: you may need to restart the kernel to use updated packages.


# Load required libraries 

In [14]:
import pandas as pd 
from sklearn.model_selection import train_test_split 

from transformers import BertTokenizer, TFBertForSequenceClassification, BertConfig, AdamW, get_linear_schedule_with_warmup
from tensorflow.keras.optimizers import Adam

from tensorflow.keras.callbacks import EarlyStopping

# Load the data

In [3]:
train_dataset = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_dataset = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# EDA
* id - a unique identifier for each tweet
* text - the text of the tweet
* location - the location the tweet was sent from (may be blank)
* keyword - a particular keyword from the tweet (may be blank)
* target - in train.csv only, this denotes whether a tweet is about a real disaster (1) or not (0)

In [18]:
train_dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


*Notice here that this is a imbalanced dataset so we need to do the stratified sampling afterwards* 

In [6]:
train_dataset['target'].value_counts()

target
0    4342
1    3271
Name: count, dtype: int64

# Prepare the Data 

In [7]:
X_train, X_val, y_train, y_val = train_test_split(
    train_dataset['text'],  # Text data
    train_dataset['target'],  # Corresponding labels
    test_size=0.2,  # 20% of the data for validation
    random_state=42,  # Seed for reproducibility
    stratify=train_dataset['target']  # Stratify by the target variable to maintain class balance
)

In [8]:
print(len(X_train), len(y_train))
print(len(X_val), len(y_val))

6090 6090
1523 1523


## Load pre-trained tokenizer and tokenize the dataset

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(X_val.tolist(), truncation=True, padding=True, max_length=128)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
import keras_nlp

# Load a DistilBERT model.
preset= "distil_bert_base_en_uncased"

# Use a shorter sequence length.
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(preset,
                                                                   sequence_length=160,
                                                                   name="preprocessor_4_tweets"
                                                                  )

# Pretrained classifier.
classifier = keras_nlp.models.DistilBertClassifier.from_preset(preset,
                                                               preprocessor = preprocessor, 
                                                               num_classes=2)

classifier.summary()

2024-02-19 16:48:13.161439: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-19 16:48:13.161533: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-19 16:48:13.287102: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Using TensorFlow backend


Attaching 'tokenizer.json' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'tokenizer.json' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'assets/tokenizer/vocabulary.txt' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'config.json' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...
Attaching 'model.weights.h5' from model 'keras/distil_bert/keras/distil_bert_base_en_uncased/2' to your Kaggle notebook...
  return id(getattr(self, attr)) not in self._functional_layer_ids
  return id(getattr(self, attr)) not in self._functional_layer_ids


## Convert to Tensorflow dataset 

In [9]:
import tensorflow as tf


def create_tf_dataset(encodings, labels):
    dataset = tf.data.Dataset.from_tensor_slices((
        {
            'input_ids': encodings['input_ids'],
            'attention_mask': encodings['attention_mask']
        },
        labels
    ))
    dataset = dataset.shuffle(10000).batch(32)
    return dataset

train_dataset = create_tf_dataset(train_encodings, y_train)
val_dataset = create_tf_dataset(val_encodings, y_val)


## Create PyTorch Datasets

In [19]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx], dtype=torch.long) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Convert the lists to tensors before creating datasets if not already tensors
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.long)
y_val_tensor = torch.tensor(y_val.to_numpy(), dtype=torch.long)

# Create PyTorch datasets
train_dataset = TextDataset(train_encodings, y_train_tensor)
val_dataset = TextDataset(val_encodings, y_val_tensor)


# Configure the model for training 

In [16]:
config = BertConfig.from_pretrained('bert-base-uncased', hidden_dropout_prob=0.5, attention_probs_dropout_prob=0.5)

model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

optimizer = Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
from transformers import BertForSequenceClassification, AdamW

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

from transformers import get_linear_schedule_with_warmup
import torch

optimizer = AdamW(model.parameters(), lr=5e-5)

epochs = 4
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

total_steps = len(train_loader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0, # Default value
                                            num_training_steps=total_steps)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
for batch in train_loader:
    print({key: val.shape for key, val in batch.items()})

  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)


{'input_ids': torch.Size([32, 84]), 'token_type_ids': torch.Size([32, 84]), 'attention_mask': torch.Size([32, 84]), 'labels': torch.Size([32])}
{'input_ids': torch.Size([32, 84]), 'token_type_ids': torch.Size([32, 84]), 'attention_mask': torch.Size([32, 84]), 'labels': torch.Size([32])}
{'input_ids': torch.Size([32, 84]), 'token_type_ids': torch.Size([32, 84]), 'attention_mask': torch.Size([32, 84]), 'labels': torch.Size([32])}
{'input_ids': torch.Size([32, 84]), 'token_type_ids': torch.Size([32, 84]), 'attention_mask': torch.Size([32, 84]), 'labels': torch.Size([32])}
{'input_ids': torch.Size([32, 84]), 'token_type_ids': torch.Size([32, 84]), 'attention_mask': torch.Size([32, 84]), 'labels': torch.Size([32])}
{'input_ids': torch.Size([32, 84]), 'token_type_ids': torch.Size([32, 84]), 'attention_mask': torch.Size([32, 84]), 'labels': torch.Size([32])}
{'input_ids': torch.Size([32, 84]), 'token_type_ids': torch.Size([32, 84]), 'attention_mask': torch.Size([32, 84]), 'labels': torch.Size

# Train the Model 

In [12]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,
    callbacks=[early_stopping]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


In [22]:
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss  # Correct way to access the loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
    avg_train_loss = train_loss / len(train_loader)
    print(f'Epoch {epoch + 1}, Train Loss: {avg_train_loss:.4f}')

    # Validation step
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss  # Correct way to access the loss
            val_loss += loss.item()
    avg_val_loss = val_loss / len(val_loader)
    print(f'Epoch {epoch + 1}, Validation Loss: {avg_val_loss:.4f}')


  item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
100%|██████████| 191/191 [00:50<00:00,  3.82it/s]


Epoch 1, Train Loss: 0.4423
Epoch 1, Validation Loss: 0.3649


100%|██████████| 191/191 [00:49<00:00,  3.88it/s]


Epoch 2, Train Loss: 0.3065
Epoch 2, Validation Loss: 0.3932


100%|██████████| 191/191 [00:49<00:00,  3.88it/s]


Epoch 3, Train Loss: 0.1966
Epoch 3, Validation Loss: 0.4128


100%|██████████| 191/191 [00:49<00:00,  3.88it/s]


Epoch 4, Train Loss: 0.1197
Epoch 4, Validation Loss: 0.5369


### Keras NLP 

In [17]:
import keras_core as keras

# Compile
classifier.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), #'binary_crossentropy',
    #optimizer=keras.optimizers.Adam(1e-5),
    optimizer=Adam(learning_rate=5e-5),
    metrics= ["accuracy"]  
)
BATCH_SIZE = 32
EPOCHS = 2
# Fit
history = classifier.fit(x=X_train,
                         y=y_train,
                         batch_size=BATCH_SIZE,
                         epochs=EPOCHS, 
                         validation_data=(X_val, y_val)
                        )

Epoch 1/2


I0000 00:00:1708361667.400985     115 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/2


## Make prediciton to the test.csv and save as submission.csv 

In [35]:
from transformers import BertTokenizer

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text for prediction
test_encodings = tokenizer(test_dataset['text'].tolist(), truncation=True, padding=True, max_length=128, return_tensors="pt")


In [36]:
class PredictDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx], dtype=torch.long) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings.input_ids)

# Create a PyTorch dataset for prediction
predict_dataset = PredictDataset(test_encodings)

In [37]:
from torch.utils.data import DataLoader

# Prepare DataLoader
predict_loader = DataLoader(predict_dataset, batch_size=32)

model.eval()  # Set the model to evaluation mode
predictions = []

with torch.no_grad():
    for batch in predict_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)
        predictions.extend(preds.tolist())

  item = {key: torch.tensor(val[idx], dtype=torch.long) for key, val in self.encodings.items()}


In [39]:
import pandas as pd

# Assuming text_dataset['id'] exists and matches the order of your predictions
predictions_df = pd.DataFrame({
    'id': test_dataset['id'],
    'target': predictions
})

# Save to CSV
predictions_df.to_csv('submission_pytorch_bert.csv', index=False)


# Deliverable 2 
## Github Repository 

# Deliverable 3 
## Competition Leaderboard 
