In [4]:
import re
import torch

import numpy as np
import pandas as pd
import pytorch_lightning as pl
import torch.nn as nn

from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, Subset
from tqdm.notebook import tnrange, tqdm
from transformers import BertTokenizer, BertModel

In [5]:
# Constants
BATCH_SIZE = 8
ACCUMULATE_GRADIENTS = 4
EPOCHS = 7
LEARNING_RATE = 0.000001

AVAIL_GPUS = min(1, torch.cuda.device_count())
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
if use_cuda:
    print(f'Using {device}!')

Using cuda!


In [6]:
# Read dataset
df_dataset = pd.read_csv('train.csv')

df_dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
# Check balance of classes
df_dataset.target.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [8]:
# Clean text
pattern_url = re.compile(r'https?://\S+|www\.\S+')
pattern_html = re.compile(r'<.*?>')

def clean_text(text):
    cleaned_text = text.strip()
    cleaned_text = pattern_url.sub(r'', cleaned_text)
    cleaned_text = cleaned_text.replace('#', '')
    
    return cleaned_text

df_dataset['text'] = df_dataset['text'].apply(clean_text)

In [9]:
# Get only the texts and targets
df_train = df_dataset[['text','target']]

# Split dataset
X_train, X_testing, y_train, y_testing = train_test_split(df_train.text, df_train.target, train_size=0.80, stratify=df_train.target)
X_valid, X_test, y_valid, y_test = train_test_split(X_testing, y_testing, test_size=0.5, stratify=y_testing)

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
bert_model = BertModel.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [80]:
# Tokenize in Dataset, do NOT return tensor, and pad them in collator + transform in tensor
class TweetDataset(Dataset):
    def __init__(self, tweets, labels, tokenizer):
        # self.tweets = tweets.tolist()
        self.tweets = [tokenizer(tweet, max_length = 512, truncation=True, add_special_tokens=True) for tweet in tweets]
        self.labels = labels.tolist()
        
    def __len__(self):
        return len(self.tweets)
    
    def __getitem__(self, idx):
        return self.tweets[idx], self.labels[idx]
    
class TweetCollator(object):
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        
    def __call__(self, batch_data):
        batch_input_ids, batch_attention_masks, batch_labels = [], [], []
        for tokenized_tweet,label in batch_data:
            batch_input_ids.append(torch.LongTensor(tokenized_tweet['input_ids']))
            batch_attention_masks.append(torch.LongTensor(tokenized_tweet['attention_mask']))
            batch_labels.append(label)
            
        return {
            'batch_input_ids': pad_sequence(batch_input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id),
            'batch_attention_masks': pad_sequence(batch_attention_masks, batch_first=True, padding_value=0),
            'batch_labels': torch.FloatTensor(batch_labels)
        }
    
# Datasets
trainDataset = TweetDataset(X_train, y_train, tokenizer)
validDataset = TweetDataset(X_valid, y_valid, tokenizer)
testDataset = TweetDataset(X_test, y_test, tokenizer)

# Collator
tweetCollator = TweetCollator(tokenizer)

# DataLoader
trainDataloader = DataLoader(dataset=trainDataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=tweetCollator)
validDataloader = DataLoader(dataset=validDataset, batch_size=BATCH_SIZE, collate_fn=tweetCollator)
testDataloader = DataLoader(dataset=testDataset, batch_size=BATCH_SIZE, collate_fn=tweetCollator)

In [81]:
class TweetModel(pl.LightningModule):
    def __init__(self, tokenizer, bert_model):
        super().__init__()
        
        self.bert = bert_model.to(device=device)
        self.classifier = nn.Sequential(
            nn.Dropout(p=0.5),
            nn.Linear(in_features=768, out_features=1),
            nn.Sigmoid()
        )
        
        self.critrion = nn.BCELoss()
        
        self.y_valid = []
        self.y_hat_valid = []
        self.valid_loss = []
        
        self.y_test = []
        self.y_hat_test = []
        self.test_loss = []
    
    def forward(self, input_ids, attention_masks):
        _, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_masks, return_dict=False)
        
        return self.classifier(pooled_output)
    
    def training_step(self, train_batch, batch_idx):
        # X
        input_ids = train_batch['batch_input_ids'].to(device=device)
        attention_mask = train_batch['batch_attention_masks'].to(device=device)
        # y
        batch_labels = train_batch['batch_labels'].to(device=device, dtype=torch.float32)
        
        pred = self(input_ids, attention_mask)
            
        loss = self.critrion(pred.squeeze(1), batch_labels)
        
        self.log('train_loss', loss)
  
        return loss
    
    def validation_step(self, val_batch, batch_idx):
        # X
        input_ids = val_batch['batch_input_ids'].to(device=device)
        attention_mask = val_batch['batch_attention_masks'].to(device=device)
        # y
        batch_labels = val_batch['batch_labels'].to(device=device, dtype=torch.float32)
        
        pred = self(input_ids, attention_mask)
            
        loss = self.critrion(pred.squeeze(1), batch_labels)
        
        self.y_valid.extend(batch_labels.cpu().numpy())
        self.y_hat_valid.extend(pred.cpu().numpy().reshape(1,-1).squeeze(0))
        self.valid_loss.append(loss.detach().cpu().numpy())     
        
        return {"loss": loss}
    
    def validation_epoch_end(self, outputs):
        # Compute mean valid loss
        mean_val_loss = sum(self.valid_loss) / len(self.valid_loss)
        
        # Compute accuracy
        pred = (np.array(self.y_hat_valid) >= 0.5)
        acc = 100 * np.sum(pred == np.array(self.y_valid)) / len(self.y_valid)
        
        self.log('val_loss', mean_val_loss, prog_bar=True)
        self.log('val_acc', acc, prog_bar=True)
        
        self.y_valid = []
        self.y_hat_valid = []
        self.valid_loss = []
        
    def test_step(self, test_batch, batch_idx):
        # X
        input_ids = test_batch['batch_input_ids'].to(device=device)
        attention_mask = test_batch['batch_attention_masks'].to(device=device)
        # y
        batch_labels = test_batch['batch_labels'].to(device=device, dtype=torch.float32)
        
        pred = self(input_ids, attention_mask)
            
        loss = self.critrion(pred.squeeze(1), batch_labels)

        self.y_test.extend(batch_labels.cpu().numpy())
        self.y_hat_test.extend(pred.cpu().numpy().reshape(1,-1).squeeze(0))
        self.test_loss.append(loss.detach().cpu().numpy())
        
        return {"loss": loss}
        
    def test_epoch_end(self, outputs):
         # Compute mean test loss
        mean_test_loss = sum(self.test_loss) / len(self.test_loss)
        
        # Compute accuracy
        pred = (np.array(self.y_hat_test) >= 0.5)
        acc = 100 * np.sum(pred == np.array(self.y_test)) / len(self.y_test)
        
        self.log('test_loss', mean_test_loss, prog_bar=True)
        self.log('test_acc', acc, prog_bar=True)
        
        self.y_test = []
        self.y_hat_test = []
        self.test_loss = []
        
    def predict_step(self, tweet_tokenized):
        input_ids = torch.LongTensor(tweet_tokenized['input_ids']).to(device=device).unsqueeze(0)
        attention_mask = torch.ones_like(input_ids).to(device=device).unsqueeze(0)
        
        pred = self(input_ids, attention_mask)
        pred = (pred >= 0).squeeze(0)
        pred = pred[0].int().item()
        
        return pred
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=LEARNING_RATE)
        return optimizer

    def set_device(self, device):
        self.to_device = device
        self.to(device)

In [82]:
# Declare model
tweet_classifier = TweetModel(tokenizer=tokenizer, bert_model=bert_model)

In [83]:
# Declare trainer
earlystop_callback = pl.callbacks.EarlyStopping(monitor='val_loss', min_delta=0.001, patience=5, verbose=True, mode="min")

trainer = pl.Trainer(
    gpus=AVAIL_GPUS,
    max_epochs=EPOCHS,
    accumulate_grad_batches=ACCUMULATE_GRADIENTS,
    gradient_clip_val=1,
    callbacks=[earlystop_callback],
    num_sanity_val_steps=0
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [84]:
# Training loop
trainer.fit(tweet_classifier, trainDataloader, validDataloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type       | Params
------------------------------------------
0 | bert       | BertModel  | 108 M 
1 | classifier | Sequential | 769   
2 | critrion   | BCELoss    | 0     
------------------------------------------
108 M     Trainable params
0         Non-trainable params
108 M     Total params
433.244   Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Metric val_loss improved. New best score: 0.451


Validation: 0it [00:00, ?it/s]

Metric val_loss improved by 0.032 >= min_delta = 0.001. New best score: 0.419


Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Monitored metric val_loss did not improve in the last 5 records. Best score: 0.419. Signaling Trainer to stop.


In [85]:
# Evaluate
trainer.test(dataloaders=testDataloader)

  f"`.{fn}(ckpt_path=None)` was called without a model."
Restoring states from the checkpoint path at /content/lightning_logs/version_9/checkpoints/epoch=6-step=1337.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at /content/lightning_logs/version_9/checkpoints/epoch=6-step=1337.ckpt


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_acc             80.83989501312335
        test_loss           0.5049834690677623
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


[{'test_acc': 80.83989501312335, 'test_loss': 0.5049834690677623}]

In [86]:
# Submit result
tweet_classifier.set_device('cuda' if use_cuda else 'cpu')
df_submission_in = pd.read_csv('test.csv')

x_sub = df_submission_in['text'].apply(clean_text)

tweets_sub = [tokenizer(tweet, max_length = 512, truncation=True, add_special_tokens=True) for tweet in x_sub]
    
y_pred_sub = [tweet_classifier.predict_step(tokenized_tweet) for tokenized_tweet in tweets_sub]

df_submission_out = pd.read_csv('sample_submission.csv')

df_submission_out['target'] = y_pred_sub

df_submission_out.to_csv('submission.csv', index=False)