<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/modeling_TextCNN/test_sample_TextCNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the project sources and switch the working directory to the TextCNN
# folder; presumably this is what lets the later `from model import TextCNN`
# resolve -- confirm against the repository layout.
# The commented-out line below removes an existing checkout; uncomment it to
# start from a clean clone when re-running this cell.
# ! rm -rf PyTorch-Architectures/
! git clone https://github.com/vishal-burman/PyTorch-Architectures.git
%cd PyTorch-Architectures/modeling_TextCNN/

In [None]:
! pip install transformers
! pip install datasets
! pip install sentencepiece

In [2]:
import time
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from model import TextCNN
from transformers import BertTokenizer
from datasets import load_dataset

In [3]:
# Load the dataset by its hub identifier; the resulting object is indexed by
# split name further down (only the 'train' split is read in this notebook).
DATASET_NAME = "tweets_hate_speech_detection"
dataset = load_dataset(DATASET_NAME)

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (/root/.cache/huggingface/datasets/tweets_hate_speech_detection/default/0.0.0/c32a982d8b2d6233065d820ac655454174f8aaa8faddc74979cf793486acd3b0)


In [4]:
# Reduce every row of the 'train' split to a plain dict that keeps only the
# tweet text (stored under 'text') and its label.
sentences = [
    {'text': row['tweet'], 'label': row['label']}
    for row in dataset['train']
]
print('Length of total samples: ', len(sentences))

Length of total samples:  31962


In [5]:
# Seed the random number generators before shuffling so that the train/valid
# split derived from this ordering is identical on every fresh
# "Restart & Run All". PyTorch's global RNG is seeded as well, since later
# cells rely on it (e.g. for shuffled batching).
# NOTE: the shuffle is in place, so re-running only this cell reorders the
# already-shuffled list again; re-run from the cell that builds `sentences`.
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
random.shuffle(sentences)

In [6]:
# Index of the first validation sample: the leading 90% of the list is used
# for training and the remainder is held out for validation.
lim = (len(sentences) * 90) // 100
train_sentences, valid_sentences = sentences[:lim], sentences[lim:]

print('Length of Train samples: ', len(train_sentences))
print('Length of Valid samples: ', len(valid_sentences))

Length of Train samples:  28765
Length of Valid samples:  3197


In [7]:
class CustomDataset(Dataset):
  """Dataset that tokenizes one text/label record each time it is indexed.

  Every element of `list_samples` is expected to be a mapping with a 'text'
  entry and a 'label' entry. No tokenization happens up front; the tokenizer
  is called inside `__getitem__`.
  """

  def __init__(self, tokenizer, list_samples, max_input_length=4):
    # `tokenizer` is called as a function for each requested item and must
    # return a mapping that contains 'input_ids'.
    self.tokenizer = tokenizer
    self.list_samples = list_samples
    # Forwarded to the tokenizer as `max_length` (used for both padding and
    # truncation in the call below).
    self.max_input_length = max_input_length

  def __len__(self):
    """Return the number of stored records."""
    return len(self.list_samples)

  def __getitem__(self, idx):
    """Tokenize record `idx` and return its token ids together with its label.

    Returns:
      dict with
        'ids': the tokenizer's 'input_ids' output (presumably shaped
               (1, max_input_length) because a single text is encoded with
               return_tensors='pt' -- confirm against the tokenizer),
        'tgt': the record's label wrapped in a tensor.
    """
    record = self.list_samples[idx]
    encoding = self.tokenizer(
        record['text'],
        max_length=self.max_input_length,
        add_special_tokens=False,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    return {
        'ids': encoding['input_ids'],
        'tgt': torch.tensor(record['label']),
    }

In [8]:
# Load the BERT tokenizer published under the checkpoint name below.
# CustomDataset calls it with add_special_tokens=False and
# padding='max_length', so special tokens are not requested there while
# padding up to the maximum length still is.
PRETRAINED_TOKENIZER_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_TOKENIZER_NAME)

In [9]:
# Hyperparameters

# -- Taken from the tokenizer so the model agrees with the ids it produces --
PAD_IDX = tokenizer.pad_token_id
VOCAB_SIZE = tokenizer.vocab_size

# -- Model architecture (passed to the TextCNN constructor below) --
NUM_CLASSES = 2
EMBEDDING_SIZE = 8
FILTER_SIZES = [2, 2, 2]
NUM_FILTERS = 3

# -- Data pipeline --
# Used both as the tokenizer's max length and as the model's sequence length.
MAX_INPUT_LENGTH = 8
BATCH_SIZE = 32

# -- Optimisation --
EPOCHS = 3
LEARNING_RATE = 1e-3

In [11]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
  device = torch.device('cuda:0')
else:
  device = torch.device('cpu')

# Gather the constructor arguments in one place so the configuration can be
# read at a glance.
model_config = {
    'num_filters': NUM_FILTERS,
    'filter_sizes': FILTER_SIZES,
    'vocab_size': VOCAB_SIZE,
    'embedding_size': EMBEDDING_SIZE,
    'sequence_length': MAX_INPUT_LENGTH,
    'num_classes': NUM_CLASSES,
    'padding_idx': PAD_IDX,
}
model = TextCNN(**model_config)

# Kept as the cell's final expression so the notebook displays the module.
model.to(device)

TextCNN(
  (W): Embedding(30522, 8, padding_idx=0)
  (Weight): Linear(in_features=9, out_features=2, bias=False)
  (filter_list): ModuleList(
    (0): Conv2d(1, 3, kernel_size=(2, 8), stride=(1, 1))
    (1): Conv2d(1, 3, kernel_size=(2, 8), stride=(1, 1))
    (2): Conv2d(1, 3, kernel_size=(2, 8), stride=(1, 1))
  )
)

In [12]:
# Total number of scalar weights that will receive gradient updates.
params = 0
for weight in model.parameters():
  if weight.requires_grad:
    params += weight.numel()
print('Trainable Parameters: ', params)

Trainable Parameters:  244349


In [13]:
# Wrap both splits in the tokenizing dataset, using the same maximum length.
train_dataset, valid_dataset = (
    CustomDataset(tokenizer, split_samples, max_input_length=MAX_INPUT_LENGTH)
    for split_samples in (train_sentences, valid_sentences)
)

# Only the training batches are shuffled; validation keeps its order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

print("Length of Training Loader: ", len(train_loader))
print("Length of Valid Loader: ", len(valid_loader))

# Sanity check loaders: inspect just the first training batch (if any).
first_batch = next(iter(train_loader), None)
if first_batch is not None:
  # After dropping dim 1 the ids must be a 2-D tensor ...
  assert first_batch['ids'].squeeze(1).dim() == 2
  # ... and there must be exactly one target per row of ids.
  assert first_batch['tgt'].size(0) == first_batch['ids'].size(0)

Length of Training Loader:  899
Length of Valid Loader:  100


In [16]:
# AdamW over every model parameter, stepping with the configured learning rate.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

In [17]:
def compute_accuracy(model, data_loader, device):
  """Return the classification accuracy of `model` over `data_loader`, in percent.

  Args:
    model: Callable that maps a batch of token ids to per-class logits.
    data_loader: Iterable of batches. Each batch is a mapping with an 'ids'
      tensor (dim 1 is squeezed away before the forward pass) and a 'tgt'
      tensor holding one integer class label per example.
    device: Device on which the forward pass and the comparison are carried out.

  Returns:
    A 0-dim float tensor with the accuracy as a percentage, so callers can
    keep calling `.item()` on the result.

  Raises:
    ValueError: If `data_loader` yields no examples.
  """
  # Accumulate on `device` to avoid a host/device sync for every batch.
  correct_preds = torch.zeros((), dtype=torch.long, device=device)
  total_examples = 0
  with torch.no_grad():
    for sample in data_loader:
      ids = sample['ids'].squeeze(1).to(device)
      # Targets must be on the same device as the predictions; comparing a
      # CUDA tensor with a CPU tensor raises a RuntimeError.
      tgt = sample['tgt'].to(device)
      logits = model(ids)
      # Softmax preserves ordering, so the arg-max of the raw logits already
      # identifies the predicted class.
      predicted_labels = logits.argmax(dim=1)
      correct_preds += (predicted_labels == tgt).sum()
      total_examples += tgt.size(0)
  if total_examples == 0:
    raise ValueError('data_loader yielded no examples; accuracy is undefined')
  # Cast explicitly so the ratio is a true (floating point) division.
  return correct_preds.float() / total_examples * 100

def _minutes_since(reference_time):
  """Return the wall-clock minutes elapsed since `reference_time`."""
  return (time.time() - reference_time) / 60


start_time = time.time()
num_batches = len(train_loader)
for epoch in range(EPOCHS):
  # ---- One optimisation pass over the training batches ----
  model.train()
  for batch_idx, batch in enumerate(train_loader):
    input_ids = batch['ids'].squeeze(1).to(device)
    targets = batch['tgt'].to(device)

    logits = model(input_ids)
    loss = F.cross_entropy(logits, targets)

    # Clear stale gradients, back-propagate, then update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # LOGGING
    if batch_idx % 200 == 0:
      print('Batch: %04d/%04d || Epoch: %04d/%04d || Loss: %.2f' % (batch_idx, num_batches, epoch+1, EPOCHS, loss.item()))

  # ---- Evaluate on both splits with gradients disabled ----
  model.eval()
  with torch.no_grad():
    train_accuracy = compute_accuracy(model, train_loader, device)
    valid_accuracy = compute_accuracy(model, valid_loader, device)
  print('Train Accuracy: %.2f%%' % (train_accuracy.item()))
  print('Valid Accuracy: %.2f%%' % (valid_accuracy.item()))

  # Measured from the start of training, so this value is cumulative across epochs.
  epoch_elapsed_time = _minutes_since(start_time)
  print('Epoch Elapsed Time: %.2f min' % (epoch_elapsed_time))

total_training_time = _minutes_since(start_time)
print('Total Training Time: %.2f min' % (total_training_time))

Batch: 0000/0899 || Epoch: 0001/0003 || Loss: 0.58
Batch: 0200/0899 || Epoch: 0001/0003 || Loss: 0.17
Batch: 0400/0899 || Epoch: 0001/0003 || Loss: 0.14
Batch: 0600/0899 || Epoch: 0001/0003 || Loss: 0.14
Batch: 0800/0899 || Epoch: 0001/0003 || Loss: 0.16
Train Accuracy: 92.98%
Valid Accuracy: 93.06%
Epoch Elapsed Time: 0.70 min
Batch: 0000/0899 || Epoch: 0002/0003 || Loss: 0.43
Batch: 0200/0899 || Epoch: 0002/0003 || Loss: 0.06
Batch: 0400/0899 || Epoch: 0002/0003 || Loss: 0.21
Batch: 0600/0899 || Epoch: 0002/0003 || Loss: 0.06
Batch: 0800/0899 || Epoch: 0002/0003 || Loss: 0.17
Train Accuracy: 93.07%
Valid Accuracy: 93.21%
Epoch Elapsed Time: 1.40 min
Batch: 0000/0899 || Epoch: 0003/0003 || Loss: 0.12
Batch: 0200/0899 || Epoch: 0003/0003 || Loss: 0.10
Batch: 0400/0899 || Epoch: 0003/0003 || Loss: 0.09
Batch: 0600/0899 || Epoch: 0003/0003 || Loss: 0.35
Batch: 0800/0899 || Epoch: 0003/0003 || Loss: 0.19
Train Accuracy: 93.36%
Valid Accuracy: 93.40%
Epoch Elapsed Time: 2.10 min
Total Trai