<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/modeling_XLNet/test_sample_XLNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! nvidia-smi

In [None]:
! pip install transformers
! pip install datasets
! pip install sentencepiece

In [None]:
# ! rm -rf PyTorch-Architectures/
! git clone https://github.com/vishal-burman/PyTorch-Architectures.git
%cd PyTorch-Architectures/modeling_XLNet/

In [2]:
import time
from datasets import load_dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import XLNetTokenizer
from model import XLNetClassify
from config import XLNetConfig

In [3]:
dataset = load_dataset('tweets_hate_speech_detection')

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (/root/.cache/huggingface/datasets/tweets_hate_speech_detection/default/0.0.0/b85ae55489e4a8c3531632a1b4e654546689115add2a15f8bbf0ecbd779ef3ff)


In [4]:
sentences = []
labels = []
for data in dataset['train']:
  sentences.append(data['tweet'])
  labels.append(data['label'])
  
assert len(sentences) == len(labels)
print('Total Samples: ', len(sentences))

Total Samples:  31962


In [5]:
class CustomDataset(Dataset):
  def __init__(self, tokenizer, list_sentences, labels=None, max_len=16):
    self.tokenizer = tokenizer
    self.list_sentences = list_sentences
    self.labels = labels
    self.max_len = max_len
  
  def __len__(self):
    return len(self.list_sentences)
  
  def __getitem__(self, idx):
    texts = self.list_sentences[idx]
    tokens = tokenizer(texts, max_length=self.max_len,
                        padding='max_length', truncation=True,
                        return_tensors='pt')
    if self.labels is not None:
      tgt = torch.tensor(self.labels[idx])
    else:
      tgt = None
    
    return {
        'inp_ids': tokens['input_ids'],
        'inp_mask': tokens['attention_mask'],
        'tgt_ids' : tgt,
    }

In [6]:
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

In [7]:
sample_sentences = sentences[:100]
sample_labels = labels[:100]
sample_dataset = CustomDataset(tokenizer,
                               list_sentences=sample_sentences,
                               labels=sample_labels,
                               max_len=16)
sample_loader = DataLoader(dataset=sample_dataset,
                           batch_size=2,
                           shuffle=False)
for sample in sample_loader:
  assert sample['inp_ids'].squeeze(1).dim() == 2
  assert sample['inp_ids'].squeeze(1).size(1) == sample['inp_mask'].squeeze(1).size(1)
  assert sample['tgt_ids'].size(0) == sample['inp_ids'].size(0)
  break

In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
scaler = torch.cuda.amp.GradScaler()
config = XLNetConfig()
config.n_layer = 8
model = XLNetClassify(config)
model.to(device)

In [9]:
params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Trainable Parameters: ', params)

Trainable Parameters:  91581442


In [10]:
split = 90 * len(sentences) // 100
train_sentences = sentences[:split]
train_labels = labels[:split]
assert len(train_sentences) == len(train_labels)

valid_sentences = sentences[split:]
valid_labels = labels[split:]
assert len(valid_sentences) == len(valid_labels)

print('Train Samples: ', len(train_sentences))
print('Valid Samples: ', len(valid_sentences))

Train Samples:  28765
Valid Samples:  3197


In [11]:
# Space for Hyperparameters for training
BATCH_SIZE = 32
MAX_INP_LEN = 16
LEARNING_RATE = 3e-5
EPOCHS = 2

In [12]:
train_dataset = CustomDataset(tokenizer,
                              list_sentences=train_sentences,
                              labels=train_labels,
                              max_len=MAX_INP_LEN)

valid_dataset = CustomDataset(tokenizer,
                              list_sentences=valid_sentences,
                              labels=valid_labels,
                              max_len=MAX_INP_LEN)

In [13]:
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=False)
valid_loader = DataLoader(dataset=valid_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=False)
print('Train Loader: ', len(train_loader), " samples")
print('Valid Loader: ', len(valid_loader), " samples")

Train Loader:  899  samples
Valid Loader:  100  samples


In [14]:
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [15]:
# Sanity check forward pass
model.eval()
with torch.set_grad_enabled(False):
  for sample in train_loader:
    input_ids = sample['inp_ids'].squeeze(1).to(device)
    attention_mask = sample['inp_mask'].squeeze(1).to(device)
    target_ids = sample['tgt_ids'].to(device)
    outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                    labels=target_ids)
    print(outputs[0].shape, outputs[1])
    break

torch.Size([32, 2]) tensor(1.0111, device='cuda:0')


In [16]:
def compute_accuracy(model, data_loader, device):
  correct_preds, num_examples = 0, 0
  with torch.set_grad_enabled(False):
    for sample in data_loader:
      input_ids = sample['inp_ids'].squeeze(1).to(device)
      attention_mask = sample['inp_mask'].squeeze(1).to(device)
      outputs = model(input_ids=input_ids, attention_mask=attention_mask)
      target_ids = sample['tgt_ids'].to(device)
      logits = outputs[0]
      prob = F.softmax(logits, dim=-1)
      _, preds = torch.max(prob, dim=1)
      correct_preds += (preds == target_ids).sum()
      num_examples += target_ids.size(0)
  return correct_preds.float() / num_examples * 100

start_time = time.time()
for epoch in range(EPOCHS):
  model.train()
  for idx, sample in enumerate(train_loader):
    input_ids = sample['inp_ids'].squeeze(1).to(device)
    attention_mask = sample['inp_mask'].squeeze(1).to(device)
    target_ids = sample['tgt_ids'].unsqueeze(0).to(device)
    outputs = model(input_ids=input_ids, attention_mask=attention_mask, 
                    labels=target_ids)

    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
      loss = outputs[1]
    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

    # LOGGING
    if idx % 200 == 0:
      print('Batch: %04d/%04d || Epoch: %04d/%04d || Loss: %.2f' % (idx,
                                                                    len(train_loader),
                                                                    epoch+1,
                                                                    EPOCHS,
                                                                    loss.item()))
  model.eval()
  with torch.set_grad_enabled(False):
    train_acc = compute_accuracy(model, train_loader, device)
    valid_acc = compute_accuracy(model, valid_loader, device)
    print('Train Accuracy: %.2f%% || Valid Accuracy: %.2f%%' % (train_acc,
                                                                valid_acc))
  epoch_elapsed_time = (time.time() - start_time) / 60
  print('Epoch Elapsed Time: %.2f min' % (epoch_elapsed_time))
total_training_time = (time.time() - start_time) / 60
print('Total Training Time: %.2f min' % (total_training_time))

Batch: 0000/0899 || Epoch: 0001/0002 || Loss: 0.91
Batch: 0200/0899 || Epoch: 0001/0002 || Loss: 0.08
Batch: 0400/0899 || Epoch: 0001/0002 || Loss: 0.16
Batch: 0600/0899 || Epoch: 0001/0002 || Loss: 0.24
Batch: 0800/0899 || Epoch: 0001/0002 || Loss: 0.30
Train Accuracy: 95.84% || Valid Accuracy: 94.59%
Epoch Elapsed Time: 2.11 min
Batch: 0000/0899 || Epoch: 0002/0002 || Loss: 0.14
Batch: 0200/0899 || Epoch: 0002/0002 || Loss: 0.04
Batch: 0400/0899 || Epoch: 0002/0002 || Loss: 0.10
Batch: 0600/0899 || Epoch: 0002/0002 || Loss: 0.14
Batch: 0800/0899 || Epoch: 0002/0002 || Loss: 0.34
Train Accuracy: 96.63% || Valid Accuracy: 94.49%
Epoch Elapsed Time: 4.23 min
Total Training Time: 4.23 min
