In [1]:
import numpy as np
import pandas as pd 
import torch
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import torch.nn as nn
import torch.nn.functional as F
import torchinfo
import time
import re
from torch.nn.utils.rnn import pad_sequence
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import math 
import torchinfo
from torchinfo import summary
from tqdm.notebook import tqdm
from transformers import BertTokenizer


if torch.cuda.is_available():
    device = torch.device("cuda") 
else:
    device = torch.device("cpu")
device

device(type='cuda')

# Introduction

#### In this notebook, I will be performing text classifcation using PyTorch. 
#### I hope this can be a good start to learn some basic NLP techniques.

# Data

In [2]:
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
sample = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

train_df, val_df = train_test_split(train, test_size=0.3, random_state=42)

train_df = train_df.drop(["keyword", "location"], axis=1)
val_df = val_df.drop(["keyword", "location"], axis=1)
test_df = test.drop(["keyword", "location"], axis=1)

test_df['target'] = 0 # For the custom dataset later on

In [3]:
display(train_df)

Unnamed: 0,id,text,target
1186,1707,Ashes 2015: AustraliaÛªs collapse at Trent Br...,0
4071,5789,GREAT MICHIGAN TECHNIQUE CAMP\nB1G THANKS TO @...,1
5461,7789,CNN: Tennessee movie theater shooting suspect ...,1
5787,8257,Still rioting in a couple of hours left until ...,1
7445,10656,Crack in the path where I wiped out this morni...,0
...,...,...,...
5226,7470,@Eganator2000 There aren't many Obliteration s...,0
5390,7691,just had a panic attack bc I don't have enough...,0
860,1242,Omron HEM-712C Automatic Blood Pressure Monito...,0
7603,10862,Officials say a quarantine is in place at an A...,1


In [4]:
#train_df['text'].apply(len).mean()

#### The whole point of this exercise is to categorize the type of the text.
#### If the text is about a real disaster, it is labelled as 1.
#### If not, it is labelled as 0.

In [5]:
train_df.shape

(5329, 3)

#### We have 5329 total number of training data.

## Custom Dataset

#### Let's create a custom dataset.

In [6]:
class CustomDataset(Dataset):
    def __init__(self, dataframe):
        self.df = dataframe        
        
    def __len__(self):
        return self.df.shape[0]
    
    def __getitem__(self, idx):
        
        X = self.df['text'].iloc[idx]
        y = torch.tensor(self.df['target'].iloc[idx], dtype=torch.float)
            
        return X,y

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

for df in [train_df, val_df, test_df]:
    for i in range(df.shape[0]):
        # Removing unnecessary details from the text
        df['text'].iloc[i] = re.sub(r'@\w+\s*', '', df['text'].iloc[i])
        df['text'].iloc[i] = re.sub(r'#\w+\s*', '', df['text'].iloc[i])
        df['text'].iloc[i] = re.sub(r'https?://\S+', '', df['text'].iloc[i])
        
        df['text'].iloc[i] = tokenizer.encode(df['text'].iloc[i], add_special_tokens=True, padding='max_length', max_length=20, return_tensors='pt')[0][:20]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [8]:
train_df['text'].apply(len).mean()

20.0

In [9]:
"""vocab = build_vocab_from_iterator(train_df['text'], min_freq=2)
vocab.insert_token('<unk>', 0) # Unknown string            
vocab.set_default_index(vocab['<unk>'])   
print(f"The length of the vocab is {len(vocab)}")   """

'vocab = build_vocab_from_iterator(train_df[\'text\'], min_freq=2)\nvocab.insert_token(\'<unk>\', 0) # Unknown string            \nvocab.set_default_index(vocab[\'<unk>\'])   \nprint(f"The length of the vocab is {len(vocab)}")   '

#### Working as intended!

In [10]:
label_pipeline = lambda x: [0.,1.] if x == 1 else [1., 0.]

## Custom DataLoader

In [11]:
train_dataset_custom = CustomDataset(train_df)
val_dataset_custom = CustomDataset(val_df)
test_dataset_custom = CustomDataset(test_df)

In [12]:
def collate_batch(batch):
    text_list, label_list = [], []
    for (_text, _label) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(_text, dtype=torch.int64).view(-1)
        text_list.append(processed_text.numpy())
    label_list = torch.tensor(label_list, dtype=torch.int64).view(len(batch),-1)
    text_list = torch.from_numpy(np.array(text_list))
    return text_list.to(device),label_list.to(device)

train_dataloader = DataLoader(train_dataset_custom, batch_size=32, shuffle=True, collate_fn=collate_batch)
val_dataloader = DataLoader(val_dataset_custom, batch_size=32, shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset_custom, batch_size=1, shuffle=False, collate_fn=collate_batch)

In [13]:
text, label = next(iter(train_dataloader))
print(text.shape, label.shape)

  """


torch.Size([32, 20]) torch.Size([32, 2])


In [14]:
text

tensor([[  101,  1045,  2064,  1005,  1056,  3422, 11937,  2102,  2265,  2049,
          2066,  1037,  5469,  3185,  2000,  2033,  1045,  2131, 28945,  2019],
        [  101,  5262,  1998,  1037, 17431, 26136,  2100,  2801,  2000,  2173,
          2012,  1996,  8680,  4765,  2890,  1997,  1037,  2047,  2695,  1011],
        [  101,  6302,  1024,  7198,  1004, 23713,  1025,  2303,  2573, 25536,
          4524,  1999,  2566,  2072, 10105, 19099,  2630,  2007,  6967, 28315],
        [  101,  1996, 12168,  1006,  3521,  2022,  2588,  2032,  1007,  2056,
          1005,  3828,  4426,  2013,  3109, 10273,  2130,  2065,  2009,  2003],
        [  101,  2052,  1005,  2310,  2042,  2307,  2065, 16914,  2072,  4213,
          2074, 16379,  2008,  2210,  9350,  1012,   102,     0,     0,     0],
        [  101,  1996,  6745,  2013,  7657,  2003,  1037,  3035,  1999,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  1999,  1037,  3959,  2017,  2

#### Before sending the data to our model, collate_fn function works on a batch of samples generated from DataLoader.
#### Label is a tensor saving the labels of individual text entries.
#### Read more about it [here](https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html).

# Modeling

In [15]:
class TextClassificationModel(nn.Module):

    def __init__(self, vocab_size: int, embedding_dim: int, d_hid: int, dropout: float = 0.2, n_layers: int = 6, bidirectional = False):
        super().__init__()
        self.hidden_size = d_hid
        self.n_layers = n_layers 
        
        self.embeddings = nn.Embedding(num_embeddings = vocab_size, embedding_dim = embedding_dim)
        self.lstm = nn.LSTM(batch_first = True, bidirectional = bidirectional, input_size = embedding_dim, hidden_size = d_hid, dropout = dropout, num_layers = n_layers)
        self.linear_input_size = 2 if bidirectional == True else 1
        self.classifier = nn.Linear(d_hid * self.linear_input_size, 2)  

    def forward(self, src: torch.Tensor) -> torch.Tensor:
        h0 = torch.zeros(self.n_layers * self.linear_input_size, src.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.n_layers * self.linear_input_size, src.size(0), self.hidden_size).to(device)
        
        out = self.embeddings(src)
        out, (h_n, c_n) = self.lstm(out, (h0, c0)) # out - [batch_size, seq_len, d_hid * 2 if bidirectional else d_hid]
        out = out[:, -1, :]
        out = self.classifier(out) # [batch_size, output_size]
        return out

# Training

In [16]:
num_class = 2
emsize = 300
model = TextClassificationModel(vocab_size = tokenizer.vocab_size, embedding_dim = emsize, d_hid = 128, dropout = 0.4, n_layers = 6, bidirectional = True).to(device)

In [17]:
summary(model)

Layer (type:depth-idx)                   Param #
TextClassificationModel                  --
├─Embedding: 1-1                         9,156,600
├─LSTM: 1-2                              2,416,640
├─Linear: 1-3                            514
Total params: 11,573,754
Trainable params: 11,573,754
Non-trainable params: 0

In [18]:
def train(dataloader):
    model.train()
    total_acc, total_count, total_loss = 0, 0, 0
    log_interval = 50
    start_time = time.time()

    for idx, (text, label) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text)
        loss = criterion(predicted_label, label.float())
        loss.backward()
        optimizer.step()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        total_acc += (nn.Sigmoid()(predicted_label).argmax(1) == label.argmax(1)).sum().item()
        total_count += label.size(0)
        total_loss += loss.item()
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f} | loss {:8.3f}'.format(epoch, idx, len(dataloader),total_acc/total_count, total_loss/(total_count/label.size(0))))
            total_acc, total_count, total_loss = 0, 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count, total_loss, count = 0, 0, 0, 0

    with torch.no_grad():
        for idx, (text, label) in enumerate(dataloader):
            predicted_label = model(text)
            loss = criterion(predicted_label, label.float())
            total_acc += (nn.Sigmoid()(predicted_label).argmax(1) == label.argmax(1)).sum().item()
            total_count += label.size(0)
            total_loss += loss.item()
            count += 1
    return total_acc/total_count, total_loss/count

def submission(dataloader): # Code for submission
    preds = []
    model.eval()

    with torch.no_grad():
        for idx, (text, label) in enumerate(dataloader):
            predicted_label = nn.Softmax(dim=1)(model(text))
            preds.append(predicted_label.argmax(1).item())
            
    return preds

In [19]:
EPOCHS = 10

criterion = nn.BCEWithLogitsLoss()
lr = 0.005
optimizer = torch.optim.Adam(model.parameters(), lr = lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)
total_accu = None

for epoch in tqdm(range(1, EPOCHS + 1)):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val, accu_loss = evaluate(val_dataloader)
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} | valid loss {:8.3f}'.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val, accu_loss))
    print('-' * 59)
    print("\n")

  0%|          | 0/10 [00:00<?, ?it/s]

  """


| epoch   1 |    50/  167 batches | accuracy    0.591 | loss    0.678
| epoch   1 |   100/  167 batches | accuracy    0.562 | loss    0.691
| epoch   1 |   150/  167 batches | accuracy    0.604 | loss    0.675
-----------------------------------------------------------
| end of epoch   1 | time:  5.86s | valid accuracy    0.619 | valid loss    0.626
-----------------------------------------------------------


| epoch   2 |    50/  167 batches | accuracy    0.692 | loss    0.603
| epoch   2 |   100/  167 batches | accuracy    0.701 | loss    0.624
| epoch   2 |   150/  167 batches | accuracy    0.734 | loss    0.579
-----------------------------------------------------------
| end of epoch   2 | time:  3.98s | valid accuracy    0.711 | valid loss    0.581
-----------------------------------------------------------


| epoch   3 |    50/  167 batches | accuracy    0.774 | loss    0.516
| epoch   3 |   100/  167 batches | accuracy    0.786 | loss    0.496
| epoch   3 |   150/  167 batche

# Submission

In [20]:
preds = submission(test_dataloader)

sample['target'] = preds

sample.to_csv('submission.csv', index=False)

  """
