<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/misc/test_sample_DistilBERT_PT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install -q kaggle

In [None]:
from google.colab import files
files.upload()

In [3]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/

In [4]:
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
! kaggle datasets download -d datasnaek/mbti-type

In [6]:
! unzip mbti-type.zip

Archive:  mbti-type.zip
  inflating: mbti_1.csv              


In [None]:
! pip install transformers

In [26]:
import time
import pandas as pd
import re
import random

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

In [2]:
dataset = pd.read_csv('mbti_1.csv')
dataset.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [3]:
posts = dataset['posts']
types = dataset['type']
i_dataset = []
e_dataset = []
for type_p, post in zip(types, posts):
  post = post.split('|||')
  for text in post:
    text = re.sub(r'http\S+', '', text)
    text = ' '.join(text.split()) # remove extra whitespaces from sentences
    if len(text.split()) >= 3:
      if "I" in type_p:
        i_dataset.append((text.lower(), "INTROVERT"))
      else:
        e_dataset.append((text.lower(), "EXTROVERT"))

In [4]:
# The dataset is skewed towards introvert samples
print("Original sample lengths:")
print(len(i_dataset), len(e_dataset))

# Shuffling both introvert samples and extrovert samples
random.shuffle(i_dataset)
random.shuffle(e_dataset)

# Taking min of both samples
i_dataset = i_dataset[:min(len(i_dataset), len(e_dataset))]
print('Reduced lengths:')
print(len(i_dataset), len(e_dataset))

Original sample lengths:
304390 92425
Reduced lengths:
92425 92425


In [5]:
final_dataset = []
final_dataset.extend(i_dataset)
final_dataset.extend(e_dataset)

In [7]:
# Shuffling final dataset
random.shuffle(final_dataset)

In [22]:
limit = 90 * len(final_dataset) // 100
train_list = final_dataset[:limit]
valid_list = final_dataset[limit:]
print('Length of train samples: ', len(train_list))
print('Length of valid samples: ', len(valid_list))

Length of train samples:  166365
Length of valid samples:  18485


In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)

params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Total trainable parameters: ', params)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

Total trainable parameters:  66955010


In [66]:
class PersonalityDataset(Dataset):
  def __init__(self, tokenizer, list_text, max_length=64):
    self.tokenizer = tokenizer
    self.list_text = list_text
    self.max_length = max_length
    self.samples = []
    self.build()
  
  def __len__(self):
    return len(self.samples)
  
  def __getitem__(self, idx):
    text = self.samples[idx]['text']
    target = self.samples[idx]['target']
    tokens = self.tokenizer(text, max_length=self.max_length, padding='max_length', return_tensors='pt')
    input_ids = tokens['input_ids']
    attention_mask = tokens['attention_mask']
    return {
        'ids': input_ids,
        'mask': attention_mask,
        'tgt': torch.tensor(target),
    }

  def build(self):
    for sample in self.list_text:
      text = sample[0]
      p_type = 1 if sample[1] is "INTROVERT" else 0
      self.samples.append({
          'text': text,
          'target': p_type,
      })

In [67]:
train_dataset = PersonalityDataset(tokenizer, train_list, max_length=64)
valid_dataset = PersonalityDataset(tokenizer, valid_list, max_length=64)

Elapsed Time: 0.01 min


In [68]:
BATCH_SIZE = 32
LEARNING_RATE = 3e-5
EPOCHS = 10

In [69]:
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

# Check train loader
for sample in train_loader:
  ids = sample['ids']
  mask = sample['mask']
  tgt = sample['tgt'].unsqueeze(0)
  print(ids.shape, mask.shape, tgt.shape)
  break

torch.Size([32, 1, 64]) torch.Size([32, 1, 64]) torch.Size([1, 32])


In [41]:
print('Length of train_loader: ', len(train_loader))
print('Length of valid_loader: ', len(valid_loader))

Length of train_loader:  5199
Length of valid_loader:  578


In [None]:
def compute_accuracy(model, data_loader, device):
  correct_pred, num_examples = 0, 0
  for sample in data_loader:
    ids = sample['ids'].to(device)
    mask = sample['mask'].to(device)
    tgt = sample['mask'].to(device)
    outputs = model(input_ids=ids, attention_mask=mask)
    logits = outputs.logits
    probas = F.softmax(logits, dim=1)
    _, predicted_labels = torch.max(probas, 1)
    pass

start_time = time.time()
for epoch in range(EPOCHS):
  model.train()
  for idx, sample in enumerate(train_loader):
    ids = sample['ids'].to(device)
    mask = sample['mask'].to(device)
    tgt = sample['tgt'].unsqueeze(0).to(device)

    outputs = model(input_ids=ids, attention_mask=mask, labels=tgt)
    loss = outputs.loss

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # LOGGING
    if idx % 1000 == 0:
      print('Batch: %04d/%04d || Epoch: %04d/%04d || Loss: %.2f' % (idx, len(train_loader), epoch+1, EPOCHS, loss.item()))
  
  model.eval()
  with torch.set_grad_enabled(False):
    train_acc = compute_accuracy(model, train_loader, device)
    valid_acc = compute_accuracy(model, valid_loader, device)
    print('Train Accuracy: %.2f%%' % (train_acc))
    print('Valid Accuracy: %.2f%%' % (valid_acc))
  epoch_elapsed_time = (time.time() - start_time) / 60
  print('Epoch Elapsed Time: %.2f min' % (epoch_elapsed_time))
total_training_time = (time.time() - start_time) / 60
print('Total Training Time: %.2f min' % (total_training_time))