In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


forked and edited from 
https://www.kaggle.com/bkanupam/disaster-tweets-bilstm-fasttext

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import binary_cross_entropy_with_logits, binary_cross_entropy
from torchmetrics import Accuracy, F1


import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer

import re

import torchtext
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence


import string
import statistics


from sklearn import model_selection
from sklearn import metrics

In [3]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Configuration for training

class MODEL_EVAL_METRIC:
    accuracy = "accuracy"
    f1_score = "f1_score"

class Config:
    VOCAB_SIZE = 0
    BATCH_SIZE = 512
    EMB_SIZE = 300
    OUT_SIZE = 2
    NUM_FOLDS = 1
    NUM_EPOCHS = 20
    NUM_WORKERS = 8
    
    # Whether to update the pretrained embedding weights during training process
    EMB_WT_UPDATE = True
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    MODEL_EVAL_METRIC = MODEL_EVAL_METRIC.accuracy
    FAST_DEV_RUN = False 
    
    PATIENCE = 6    
    IS_BIDIRECTIONAL = True
    # model hyperparameters
    MODEL_PARAMS = {
        "hidden_size": 141, 
        "num_layers": 2,         
        "drop_out": 0.4258,
        "lr": 0.000366,
        "weight_decay": 0.00001
    }

DATA_PATH = "/kaggle/input/nlp-getting-started/"    
    
# For results reproducibility 
# sets seeds for numpy, torch, python.random and PYTHONHASHSEED.
pl.seed_everything(42, workers=True)

42

In [5]:
df_train = pd.read_csv(DATA_PATH + "train.csv")
df_test = pd.read_csv(DATA_PATH + "test.csv")
print(f"Rows in train.csv = {len(df_train)}")
print(f"Rows in test.csv = {len(df_test)}")
pd.set_option('display.max_colwidth', None)
df_train.head()

Rows in train.csv = 7613
Rows in test.csv = 3263


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


In [6]:
# K fold and cross validation 

# split the training dataframe into kfolds for cross validation. We do this before any processing is done
# on the data. We use stratified kfold if the target distribution is unbalanced

def strat_kfold_dataframe(df, target_col_name, num_folds=5):
    # we create a new column called kfold and fill it with -1
    df["kfold"] = -1
    
    # randomize of shuffle the rows of dataframe before splitting is done
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)
    
    # get the target data
    y = df["target"].values
    
    skf = model_selection.StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
    for fold, (train_index, val_index) in enumerate(skf.split(X=df, y=y)):
        df.loc[val_index, "kfold"] = fold
    return df        

df_train = strat_kfold_dataframe(df_train, target_col_name="target", num_folds=5)   
df_train.head()

Unnamed: 0,id,keyword,location,text,target,kfold
0,3796,destruction,,So you have a new weapon that can cause un-imaginable destruction.,1,2
1,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just got soaked in a deluge going for pads and tampons. Thx @mishacollins @/@,0,1
2,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe CoL police can catch a pickpocket in Liverpool Stree... http://t.co/vXIn1gOq4Q,1,0
3,191,aftershock,,Aftershock back to school kick off was great. I want to thank everyone for making it possible. What a great night.,0,3
4,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts develop a defensive self - one that decreases vulnerability. (3,0,4


In [7]:
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
def clean_special_chars(text, punct):
    for p in punct:
        text = text.replace(p, ' ')
    return text

def process_tweet(df, text, keyword):
    lemmatizer = WordNetLemmatizer()    
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)    
    processed_text = []
    stop = stopwords.words("english")
    for tweet, keyword in zip(df[text], df[keyword]):
        tweets_clean = []        
        # remove stock market tickers like $GE
        #tweet = tweet + " " + keyword
        tweet = re.sub(r'\$\w*', '', tweet)
        # remove old style retweet text "RT"
        tweet = re.sub(r'^RT[\s]+', '', tweet)
        # remove hyperlinks
        tweet = re.sub(r'http\S+', '', tweet)
        # remove hashtags
        # only removing the hash #, @, ... sign from the word
        tweet = re.sub(r'\.{3}|@|#', '', tweet)    
        tweet = clean_special_chars(tweet, punct)
        # remove junk characters which don't have an ascii code
        tweet = tweet.encode("ascii", "ignore").decode("utf-8", "ignore")
        # tokenize tweets        
        tweet_tokens = tokenizer.tokenize(tweet)
        for word in tweet_tokens:
            tweets_clean.append(word)
        processed_text.append(" ".join(tweets_clean))        
    df['processed_text'] = np.array(processed_text)

In [8]:
# Fill in missing values
df_train["keyword"] = df_train["keyword"].fillna("no_keyword")
df_test["keyword"] = df_test["keyword"].fillna("no_keyword")
process_tweet(df_train, 'text', "keyword")
process_tweet(df_test, 'text', "keyword")

# length of the processed tweet
df_train["prcsd_tweet_len"] = df_train["processed_text"].apply(lambda row: len(row.split()))
df_test["prcsd_tweet_len"] = df_test["processed_text"].apply(lambda row: len(row.split()))

df_train.head()

Unnamed: 0,id,keyword,location,text,target,kfold,processed_text,prcsd_tweet_len
0,3796,destruction,,So you have a new weapon that can cause un-imaginable destruction.,1,2,so you have a new weapon that can cause un imaginable destruction,12
1,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just got soaked in a deluge going for pads and tampons. Thx @mishacollins @/@,0,1,the f amp ing things i do for gishwhes just got soaked in a deluge going for pads and tampons thx mishacollins,22
2,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe CoL police can catch a pickpocket in Liverpool Stree... http://t.co/vXIn1gOq4Q,1,0,dt georgegalloway rt galloway 4mayor the col police can catch a pickpocket in liverpool stree,15
3,191,aftershock,,Aftershock back to school kick off was great. I want to thank everyone for making it possible. What a great night.,0,3,aftershock back to school kick off was great i want to thank everyone for making it possible what a great night,21
4,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts develop a defensive self - one that decreases vulnerability. (3,0,4,in response to trauma children of addicts develop a defensive self one that decreases vulnerability 3,16


In [9]:
# https://pytorch.org/text/stable/vocab.html
embedding = torchtext.vocab.GloVe(name='6B', dim=300)

.vector_cache/glove.6B.zip: 862MB [02:43, 5.28MB/s]                           
100%|█████████▉| 399999/400000 [00:58<00:00, 6890.22it/s]


In [10]:
# build tweets vocab from training data
def yield_tokens(df):
    for index, row in df.iterrows():
        # yield turns a function into generator
        # the function will return an iterable object 
        yield row["processed_text"].split()

# https://pytorch.org/text/stable/vocab.html
'''
Build a Vocab from an iterator
Return a Vocab object
torchtext.vocab.build_vocab_from_iterator(iterator: Iterable, 
                                            min_freq: int = 1, 
                                            specials: Optional[List[str]] = None, 
                                            special_first: bool = True) → torchtext.vocab.vocab.Vocab
'''

tweet_vocab = build_vocab_from_iterator(yield_tokens(df_train), specials=["<unk>", "<pad>"])   
Config.VOCAB_SIZE = len(tweet_vocab)
tweet_vocab

Vocab()

In [11]:
# For the problem specific vocab, get the embedding vectors from the pre-trained embedding
# for each word in vocab and return a matrix of shape vocab_size, embedding_dim. This matrix
# will be the pretrained embedding weight matrix which we will use to create the embedding layer


def get_vocab_pt_emb_matrix(text_vocab, emb):
    embedding_matrix = []
    for token in text_vocab.get_itos():
        embedding_matrix.append(emb.get_vecs_by_tokens(token))
    return torch.stack(embedding_matrix)

pt_emb_weights = get_vocab_pt_emb_matrix(tweet_vocab, embedding)
pt_emb_layer = nn.Embedding.from_pretrained(pt_emb_weights)

In [12]:
# vectorize the processed tweet, i.e. replace each token in the tweet with its corresponding index
# in the tweet vocab

'''
lookup_indices(tokens: List[str]) → List[int]
Parameters: tokens – the tokens used to lookup their corresponding indices.
Returns: The 'indices' associated with tokens.
'''

df_train["vectorized_tweet"] = df_train["processed_text"].apply(
    lambda row:torch.LongTensor(tweet_vocab.lookup_indices(row.split()))
    )
df_train.head()

Unnamed: 0,id,keyword,location,text,target,kfold,processed_text,prcsd_tweet_len,vectorized_tweet
0,3796,destruction,,So you have a new weapon that can cause un-imaginable destruction.,1,2,so you have a new weapon that can cause un imaginable destruction,12,"[tensor(34), tensor(12), tensor(25), tensor(3), tensor(54), tensor(300), tensor(16), tensor(48), tensor(275), tensor(1597), tensor(11418), tensor(377)]"
1,3185,deluge,,The f$&amp;@ing things I do for #GISHWHES Just got soaked in a deluge going for pads and tampons. Thx @mishacollins @/@,0,1,the f amp ing things i do for gishwhes just got soaked in a deluge going for pads and tampons thx mishacollins,22,"[tensor(2), tensor(1119), tensor(28), tensor(11534), tensor(474), tensor(7), tensor(72), tensor(11), tensor(10758), tensor(32), tensor(102), tensor(15282), tensor(4), tensor(3), tensor(420), tensor(110), tensor(11), tensor(13619), tensor(8), tensor(15805), tensor(1908), tensor(5859)]"
2,7769,police,UK,DT @georgegalloway: RT @Galloway4Mayor: ÛÏThe CoL police can catch a pickpocket in Liverpool Stree... http://t.co/vXIn1gOq4Q,1,0,dt georgegalloway rt galloway 4mayor the col police can catch a pickpocket in liverpool stree,15,"[tensor(9826), tensor(10707), tensor(174), tensor(10640), tensor(7083), tensor(2), tensor(3768), tensor(77), tensor(48), tensor(2515), tensor(3), tensor(13825), tensor(4), tensor(12365), tensor(15585)]"
3,191,aftershock,,Aftershock back to school kick off was great. I want to thank everyone for making it possible. What a great night.,0,3,aftershock back to school kick off was great i want to thank everyone for making it possible what a great night,21,"[tensor(798), tensor(93), tensor(5), tensor(191), tensor(1677), tensor(98), tensor(26), tensor(206), tensor(7), tensor(155), tensor(5), tensor(549), tensor(226), tensor(11), tensor(633), tensor(14), tensor(567), tensor(56), tensor(3), tensor(206), tensor(256)]"
4,9810,trauma,"Montgomery County, MD",in response to trauma Children of Addicts develop a defensive self - one that decreases vulnerability. (3,0,4,in response to trauma children of addicts develop a defensive self one that decreases vulnerability 3,16,"[tensor(4), tensor(1288), tensor(5), tensor(405), tensor(694), tensor(6), tensor(7327), tensor(5159), tensor(3), tensor(9438), tensor(426), tensor(63), tensor(16), tensor(9412), tensor(16656), tensor(76)]"


In [13]:
class VectorizedTweetDataSet(Dataset):
    def __init__(self, tweet_vecs, labels):
        self.tweet_vecs = tweet_vecs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        each_tweet_vec = self.tweet_vecs[idx]
        each_label = self.labels[idx]
        tweet_len = len(each_tweet_vec)
        return (each_tweet_vec, each_label)

In [14]:
# Get train and validation dataset for 1 fold 

def get_fold_dls(fold, df):
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)
    
    X_train = train_df["vectorized_tweet"].to_numpy()
    y_train = train_df["target"].to_numpy()
    X_valid = valid_df["vectorized_tweet"].to_numpy()
    y_valid = valid_df["target"].to_numpy()
    
    # create dataset and data loader 
    ds_train = VectorizedTweetDataSet(X_train, y_train)
    ds_valid = VectorizedTweetDataSet(X_valid, y_valid)
    dl_train = DataLoader(ds_train, batch_size=Config.BATCH_SIZE, shuffle=True, collate_fn=pad_collate, num_workers=Config.NUM_WORKERS)
    dl_valid = DataLoader(ds_valid, batch_size=Config.BATCH_SIZE, collate_fn=pad_collate, num_workers=Config.NUM_WORKERS)
    return dl_train, dl_valid

In [15]:
# pad input sequence

# If the goal is to train with mini-batches, one needs to pad the sequences in each batch. 
# In other words, given a mini-batch of size N, if the length of the largest sequence is L, 
# one needs to pad every sequence with a length of smaller than L with zeros and make their 
# lengths equal to L. Moreover, it is important that the sequences in the batch are in the 
# descending order.

# This also can be done with BuckectIterator
def pad_collate(batch):
    # Each element in the batch is a tuple (data, label)
    # sort the batch (based on tweet word count) in descending order
    sorted_batch = sorted(batch, key=lambda x:x[0].shape[0], reverse=True)
    sequences = [x[0] for x in sorted_batch]
    sequences_padded = pad_sequence(sequences, batch_first=True, padding_value=0)
    
    # Also need to store the length of each sequence.This is later needed in order to unpad 
    # the sequences
    seq_len = torch.Tensor([len(x) for x in sequences])
    labels = torch.Tensor([x[1] for x in sorted_batch])
    return sequences_padded, seq_len, labels

****Bidirectional RNN

outputs is of size [src len, batch size, hid dim * num directions] where the first hid_dim elements in the third axis are the hidden states from the top layer forward RNN, and the last hid_dim elements are hidden states from the top layer backward RNN. 

We can think of the third axis as being the forward and backward hidden states concatenated together other

hidden is of size [n layers * num directions, batch size, hid dim], where [-2, :, :] gives the top layer forward RNN hidden state after the final time-step (i.e. after it has seen the last word in the sentence) and [-1, :, :] gives the top layer backward RNN hidden state after the final time-step (i.e. after it has seen the first word in the sentence).

In [16]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

class DisasterModel(nn.Module):
    """The RNN model."""
    def __init__(self, vocab_size, num_layers, is_bidirect, emb_size, hidden_size, output_size, 
                pt_emb_weights, emb_wt_update=False, drop_prob=0.5):
        super().__init__()        
        self.vocab_size = vocab_size
        self.num_layers = num_layers 
        
        # size of the embedding vector
        self.emb_size = emb_size
        
        self.hidden_size = hidden_size   
        self.output_dim = output_size
        self.is_bidirect = is_bidirect
        
        # Embedding layer
        self.emb_layer = nn.Embedding(self.vocab_size, emb_size)
        # copy the vocab specific weights(emb vectors) from pretrained embeddings to model embedding layer
        self.emb_layer.weight.data.copy_(pt_emb_weights)    
        # whether to update the pretrained embedding layer weights during model training
        self.emb_layer.weight.requires_grad = emb_wt_update
        
        
        # LSTM Layer        
        self.lstm_layer = nn.LSTM(
                        input_size=emb_size, 
                        hidden_size=hidden_size, 
                        batch_first=True, 
                        bidirectional=is_bidirect, 
                        num_layers=num_layers, 
                        dropout=drop_prob
                        )
        self.dropout = nn.Dropout(p = drop_prob)                        
        
        # If the RNN is bidirectional `num_directions` should be 2, else it should be 1.        
        if not is_bidirect:
            self.num_directions = 1
            self.linear = nn.Linear(self.hidden_size, self.output_dim)
        else:       
            self.num_directions = 2     
            self.linear = nn.Linear(self.hidden_size * self.num_directions, self.output_dim)
            
            
        # The activation layer which converts output to 0 or 1            
        self.act = nn.Sigmoid()            

    def forward(self, inputs, input_lengths, state):        
        # inputs = [batch_size, batch_max_seq_length]        
        # embeds is of shape batch_size * num_steps * emb_dim and is the input to lstm layer
        embeds = self.emb_layer(inputs)        
        batch_size = inputs.shape[0]        
        # embeds = [batch_size, max_seq_length, emb_dim]
        
        # pack_padded_sequence before feeding into LSTM. This is required so pytorch knows
        # which elements of the sequence are padded ones and ignore them in computation.
        # This step is done only after the embedding step
        embeds_pack = pack_padded_sequence(embeds, input_lengths.to("cpu"), batch_first=True)                
        lstm_out_pack, (h_n, c_n) = self.lstm_layer(embeds_pack)
        # h_n and c_n = [batch_size, num_directions * num_layers, hidden_size]
        
        
        # unpack the output
        lstm_out, lstm_out_len = pad_packed_sequence(lstm_out_pack, batch_first=True)        
        # lstm_out = [batch_size, max_seq_length, hidden_size * num_directions]
        
        
        if self.is_bidirect:            
            # each batch item has different seq length, so to select the hidden state at t_end for each batch item
            # a for comprehension like below is needed, a vectorized operation doesn't seem plausible
            # because we do batch_first = True here, 
            #lstm_out = [lstm_out[batch_item_index, seq_length_index-1, :] for batch_item_index, seq_length_index in enumerate(lstm_out_len)]            
            #lstm_out = torch.cat(lstm_out, dim=0).reshape(batch_size, 4 * self.hidden_size)
           
            # Another way to extract the last hidden state for the forward and backward lstm layers
            # in a BiRNN is to use h_n like this
            h_tend_fwd = h_n[-2, :, :]
            h_tend_bwd = h_n[-1, :, :]
            # h_n and c_n = [batch_size, num_directions * num_layers, hidden_size]
            # concat on dimension 1, which is "num_directions * num_layers"
            lstm_out = torch.cat((h_tend_fwd, h_tend_bwd), dim=1)
            
        else:                        
            lstm_out = h_n[-1, :, :]                    
        
        out = self.dropout(lstm_out)                
        output = self.linear(out)        
        
        # apply sigmoid activation to convert output to probability 
        output = self.act(output)
        
        # [batch_size, 2]
        return output

    def init_state(self, batch_size=1):
        """ Initialize the hidden state i.e. initialize all the neurons in all the hidden layers 
        to zero"""
        if not isinstance(self.lstm_layer, nn.LSTM):
            # `nn.GRU` takes a tensor as hidden state
            return torch.zeros((self.num_directions * self.num_layers, batch_size, self.hidden_size))
        else:
            # `nn.LSTM` takes a tuple of hidden states (h0, c0). h0 = initial
            # hidden state for each element in the batch, c0 = initial cell state
            # for each element in the batch
            return (torch.zeros((self.num_directions * self.num_layers, batch_size, self.hidden_size)),
                    torch.zeros((self.num_directions * self.num_layers,batch_size, self.hidden_size)))

In [17]:
# Pytorch lightning wrapper for model

class DisasterTweetLitModel(pl.LightningModule):
    def __init__(self, vocab_size, emb_size, output_size, pt_emb_weights, emb_wt_update, 
                hparams, model_eval_metric=MODEL_EVAL_METRIC.accuracy):
        super().__init__()
        
        #self.save_hyperparameters()
        self.lr = hparams["lr"]
        self.weight_decay = hparams["weight_decay"]
        self.model_eval_metric = model_eval_metric
        self.network = DisasterModel(
            vocab_size = vocab_size,
            num_layers = hparams["num_layers"],
            is_bidirect = Config.IS_BIDIRECTIONAL,
            emb_size = emb_size,
            hidden_size = hparams["hidden_size"],
            output_size = output_size,
            pt_emb_weights = pt_emb_weights,
            emb_wt_update = emb_wt_update,
            drop_prob = hparams["drop_out"]
        )

    def forward(self, tweets, tweet_lengths, state):
        return self.network(tweets, tweet_lengths, state)

    def configure_optimizers(self):
        model_optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, weight_decay=self.weight_decay)
        # https://pytorch.org/docs/stable/optim.html
        lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(model_optimizer, mode="min")
        return {
            "optimizer": model_optimizer,
            "lr_scheduler": {
                "scheduler": lr_scheduler,
                "monitor": "val_loss",
                "frequency": 1
            }
        }

    def training_step(self, batch, batch_idx):
        tweets, tweet_lengths, targets = batch
        # initialize the hidden and cell state of the LSTM
        h0, c0 = self.network.init_state()
        
        targets_pred = self(tweets, tweet_lengths, (h0, c0))        
 
        loss_targets = F.one_hot(targets.T.long(), num_classes=2)
        loss_targets = loss_targets.float()        
        train_loss = binary_cross_entropy(targets_pred, loss_targets)
        
        train_metric = None
        train_metric_str = ""
        
        if self.model_eval_metric == MODEL_EVAL_METRIC.accuracy:            
            targets_pred = torch.argmax(targets_pred, dim=1)            
            train_metric = Accuracy(num_classes=2)(targets_pred.cpu(), targets.long().cpu())
            train_metric_str = "train_acc"
        elif self.model_eval_metric == MODEL_EVAL_METRIC.f1_score:
            train_metric = F1(targets_pred, targets)            
            train_metric_str = "train_f1"
            
        self.log("train_loss", train_loss, prog_bar=True, logger=True, on_epoch=True, on_step=True)
        self.log(train_metric_str, train_metric, prog_bar=True, logger=True, on_epoch=True, on_step=True)
        return train_loss

    def validation_step(self, batch, batch_idx):
        tweets, tweet_lengths, targets = batch
        # initialize the hidden and cell state of the LSTM
        h0, c0 = self.network.init_state()
        
        targets_pred = self(tweets, tweet_lengths, (h0, c0))
        loss_targets = F.one_hot(targets.T.long(), num_classes=2)
        loss_targets = loss_targets.float()        
        
        val_loss = binary_cross_entropy(targets_pred, loss_targets)
        val_metric = None
        val_metric_str = ""
        
        if self.model_eval_metric == MODEL_EVAL_METRIC.accuracy:
            targets_pred = torch.argmax(targets_pred, dim=1)
            val_metric = Accuracy(num_classes=2)(targets_pred.cpu(), targets.long().cpu())
            val_metric_str = "val_acc"
        elif self.model_eval_metric == MODEL_EVAL_METRIC.f1_score:
            val_metric = F1(targets_pred, targets)            
            val_metric_str = "val_f1"
            
        self.log("val_loss", val_loss, prog_bar=True, logger=True, on_epoch=True, on_step=True)
        self.log(val_metric_str, val_metric, prog_bar=True, logger=True, on_epoch=True, on_step=True)
        return val_loss

In [18]:
# Custom lightning callback
# To record training and validation metric values at each epoch and the best metric values across all epochs

from pytorch_lightning.callbacks import Callback
from pytorch_lightning import LightningModule, Trainer
# Monitor multiple metric values that are calculated either in training or validation step and return the
# best metric values for each epoch


class MetricsAggCallback(Callback):
    def __init__(self, train_metrics_to_monitor, val_metrics_to_monitor):
        # dictionary with metric name as key and monitor mode (min, max) as the value
        # ( the same names used to log metric values in training and validation step)
        self.val_metrics_to_monitor = val_metrics_to_monitor
        self.train_metrics_to_monitor = train_metrics_to_monitor
        
        
        # dictionary with metric_name as key and list of metric value for each epoch
        self.train_metrics = {metric: [] for metric in train_metrics_to_monitor.keys()}
        self.val_metrics = {metric: [] for metric in val_metrics_to_monitor.keys()}
        
        
        # dictionary with metric_name as key and the best metric value for all epochs
        self.train_best_metric = {metric: None for metric in train_metrics_to_monitor.keys()}
        self.val_best_metric = {metric: None for metric in val_metrics_to_monitor.keys()}
        
        
        # dictionary with metric_name as key and the epoch number with the best metric value
        self.train_best_metric_epoch = {metric: None for metric in train_metrics_to_monitor.keys()}     
        self.val_best_metric_epoch = {metric: None for metric in val_metrics_to_monitor.keys()}     
        self.epoch_counter = 0           

    @staticmethod
    def process_metrics(metrics_to_monitor, metrics, best_metric, best_metric_epoch, trainer):
        metric_str = ""
        for metric, mode in metrics_to_monitor.items():
            metric_value = round(trainer.callback_metrics[metric].cpu().detach().item(), 4)            
            metric_str += f"{metric} = {metric_value}, "
            metrics[metric].append(metric_value)
            if mode == "max":
                best_metric[metric] = max(metrics[metric])            
            elif mode == "min":            
                best_metric[metric] = min(metrics[metric])            
            best_metric_epoch[metric] = metrics[metric].index(best_metric[metric]) 
        print(metric_str[:-2])

    def on_train_epoch_end(self, trainer: Trainer, pl_module: LightningModule):
        self.epoch_counter += 1        
        self.process_metrics(self.train_metrics_to_monitor, self.train_metrics, self.train_best_metric, self.train_best_metric_epoch, trainer)

    def on_validation_epoch_end(self, trainer: Trainer, pl_module: LightningModule):        
        print(f"For epoch {self.epoch_counter}")
        self.process_metrics(self.val_metrics_to_monitor, self.val_metrics, self.val_best_metric, self.val_best_metric_epoch, trainer)

In [19]:
def run_training(fold, dl_train, dl_val, pt_emb_weights, find_lr=True):
    fold_str = f"fold{fold}"
    print(f"Running training for {fold_str}")
    
    disaster_tweet_model = DisasterTweetLitModel(
        vocab_size=Config.VOCAB_SIZE,
        emb_size=Config.EMB_SIZE,
        output_size=Config.OUT_SIZE,
        pt_emb_weights=pt_emb_weights,
        emb_wt_update=Config.EMB_WT_UPDATE,
        hparams=Config.MODEL_PARAMS,
        model_eval_metric=Config.MODEL_EVAL_METRIC                
        )
    
    tb_logger = pl.loggers.TensorBoardLogger(save_dir="logs")    
    
    chkpt_file_name = fold_str + "_best_model_{epoch}_{val_loss:.4f}"
    train_metrics_to_monitor = {
        "train_loss": "min",
        "train_acc": "max"
    }
    
    val_metrics_to_monitor = {
        "val_loss": "min",
        "val_acc": "max",
        }
    
    loss_chkpt_callback = ModelCheckpoint(dirpath="./model", verbose=True, monitor="val_loss", mode="min", filename=chkpt_file_name)   
    
    metric_chkpt_callback = MetricsAggCallback(train_metrics_to_monitor, val_metrics_to_monitor)
    
    early_stopping_callback = EarlyStopping(monitor="val_loss", patience=Config.PATIENCE, mode="min", verbose=True)
    
    trainer = pl.Trainer(
        gpus = 1,
        deterministic = True,
        auto_select_gpus = True,
        progress_bar_refresh_rate = 20,
        max_epochs = Config.NUM_EPOCHS,
        logger = tb_logger,
        auto_lr_find = True,    
        #precision = Config.PRECISION,   
        fast_dev_run = Config.FAST_DEV_RUN, 
        gradient_clip_val = 1.0,        
        callbacks = [loss_chkpt_callback, metric_chkpt_callback, early_stopping_callback]
    )    
    
    if find_lr:
        trainer.tune(model=disaster_tweet_model, train_dataloaders=dl_train)
        print(disaster_tweet_model.lr)
        
        
    trainer.fit(disaster_tweet_model, train_dataloaders=dl_train, val_dataloaders=dl_val)
    
    
    fold_train_metrics = {
        metric: (metric_chkpt_callback.train_best_metric[metric], metric_chkpt_callback.train_best_metric_epoch[metric]) 
        for metric in train_metrics_to_monitor.keys()
    }
    
    fold_val_metrics = {
        metric: (metric_chkpt_callback.val_best_metric[metric], metric_chkpt_callback.val_best_metric_epoch[metric]) 
        for metric in val_metrics_to_monitor.keys()
    }   
    
    best_model = loss_chkpt_callback.best_model_path
    del trainer, disaster_tweet_model, loss_chkpt_callback, metric_chkpt_callback 
    return fold_train_metrics, fold_val_metrics, best_model

In [20]:
find_lr = True
all_fold_val_loss = []
all_fold_val_acc = []

for fold in range(Config.NUM_FOLDS):
    dl_train, dl_val = get_fold_dls(fold, df_train)
    fold_train_metrics, fold_val_metrics, chkpt_file_name = run_training(fold, dl_train, dl_val, pt_emb_weights, find_lr=False)    
    all_fold_val_loss.append((fold_val_metrics["val_loss"][0], chkpt_file_name))
    all_fold_val_acc.append(fold_val_metrics["val_acc"][0])
    
    print(f"Best train metrics values for fold{fold}")    
    print(fold_train_metrics)
    print(f"Best val metrics values for fold{fold}")    
    print(fold_val_metrics)     

  cpuset_checked))


Running training for fold0


Validation sanity check: 0it [00:00, ?it/s]

  cpuset_checked))


For epoch 0
val_loss = 0.6905, val_acc = 0.5703


  f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

For epoch 0
val_loss = 0.6541, val_acc = 0.5699
train_loss = 0.6765, train_acc = 0.5698


Validating: 0it [00:00, ?it/s]

For epoch 1
val_loss = 0.5549, val_acc = 0.7781
train_loss = 0.6119, train_acc = 0.6486


Validating: 0it [00:00, ?it/s]

For epoch 2
val_loss = 0.4963, val_acc = 0.7603
train_loss = 0.518, train_acc = 0.7691


Validating: 0it [00:00, ?it/s]

For epoch 3
val_loss = 0.442, val_acc = 0.8037
train_loss = 0.4521, train_acc = 0.8015


Validating: 0it [00:00, ?it/s]

For epoch 4
val_loss = 0.4253, val_acc = 0.8076
train_loss = 0.4132, train_acc = 0.823


Validating: 0it [00:00, ?it/s]

For epoch 5
val_loss = 0.4189, val_acc = 0.8201
train_loss = 0.3887, train_acc = 0.8355


Validating: 0it [00:00, ?it/s]

For epoch 6
val_loss = 0.4127, val_acc = 0.8227
train_loss = 0.3641, train_acc = 0.8506


Validating: 0it [00:00, ?it/s]

For epoch 7
val_loss = 0.4164, val_acc = 0.8214
train_loss = 0.3349, train_acc = 0.8678


Validating: 0it [00:00, ?it/s]

For epoch 8
val_loss = 0.4334, val_acc = 0.8181
train_loss = 0.3121, train_acc = 0.877


Validating: 0it [00:00, ?it/s]

For epoch 9
val_loss = 0.4501, val_acc = 0.807
train_loss = 0.2884, train_acc = 0.8892


Validating: 0it [00:00, ?it/s]

For epoch 10
val_loss = 0.4659, val_acc = 0.8214
train_loss = 0.2565, train_acc = 0.9051


Validating: 0it [00:00, ?it/s]

For epoch 11
val_loss = 0.5063, val_acc = 0.8135
train_loss = 0.226, train_acc = 0.9192


Validating: 0it [00:00, ?it/s]

For epoch 12
val_loss = 0.5876, val_acc = 0.7932
train_loss = 0.1924, train_acc = 0.9335
Best train metrics values for fold0
{'train_loss': (0.1924, 12), 'train_acc': (0.9335, 12)}
Best val metrics values for fold0
{'val_loss': (0.4127, 7), 'val_acc': (0.8227, 7)}


In [21]:
all_fold_val_loss

[(0.4127,
  '/kaggle/working/model/fold0_best_model_epoch=6_val_loss=0.4127.ckpt')]

In [22]:
fold_val_loss_sorted = sorted(all_fold_val_loss, key=lambda x:x[0])
all_fold_val_loss = [item[0] for item in all_fold_val_loss]
print(f"val loss across folds = {all_fold_val_loss}")
print(f"val accuracy across folds = {all_fold_val_acc}")


mean_loss = statistics.mean(all_fold_val_loss)
mean_acc = statistics.mean(all_fold_val_acc)
#std_loss = statistics.stdev(all_fold_val_loss)
#std_acc = statistics.stdev(all_fold_val_acc)
#print(f"mean val loss across folds = {mean_loss}, val loss stdev across fold = {std_loss}")
#print(f"mean val accuracy across folds = {mean_acc}, val accuracy stdev across fold = {std_acc}")

val loss across folds = [0.4127]
val accuracy across folds = [0.8227]


In [23]:
all_fold_val_loss

[0.4127]

In [24]:
fold_val_loss_sorted

[(0.4127,
  '/kaggle/working/model/fold0_best_model_epoch=6_val_loss=0.4127.ckpt')]

In [25]:
import os
os.getcwd()

'/kaggle/working'

In [26]:
best_model_across_folds = fold_val_loss_sorted[0][1]
print(f"Using best model = {best_model_across_folds} for prediction on test set")

best_model = DisasterTweetLitModel.load_from_checkpoint(
    checkpoint_path=best_model_across_folds,
    vocab_size=Config.VOCAB_SIZE,
    emb_size=Config.EMB_SIZE,
    output_size=Config.OUT_SIZE,
    pt_emb_weights=pt_emb_weights,
    emb_wt_update=Config.EMB_WT_UPDATE,
    hparams=Config.MODEL_PARAMS,
    model_eval_metric=Config.MODEL_EVAL_METRIC 
    )


tweet_vocab.set_default_index(0)
df_test["vectorized_tweet"] = df_test["processed_text"].apply(
    lambda row:torch.LongTensor(tweet_vocab.lookup_indices(row.split()))
    )

Using best model = /kaggle/working/model/fold0_best_model_epoch=6_val_loss=0.4127.ckpt for prediction on test set


In [27]:
# Do prediction with best performing model on the test set
def predict(df_test):
    test_output = []
    for index, row in df_test.iterrows():    
        vec_tweet = row["vectorized_tweet"]
        if len(vec_tweet) == 0:
            test_output.append(0)
            continue
            
        vec_tweet_len = torch.IntTensor([len(vec_tweet)])
        vec_tweet = vec_tweet.view(1, -1)    
        
        output = best_model(vec_tweet, vec_tweet_len, state=None)
        
        test_output.append(torch.argmax(output).item())    
    return test_output        

test_output = predict(df_test)
print(test_output)

[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [28]:
df_submission = pd.read_csv(DATA_PATH + 'sample_submission.csv')
df_submission['target']= test_output
df_submission.to_csv('submission.csv',index=False)