<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/misc/test_HuggingFace(BenchmarkClassification).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Shell out to nvidia-smi to show which GPU (if any) this runtime exposes.
! nvidia-smi

In [None]:
! pip install transformers
! pip install datasets
! pip install sentencepiece

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
from transformers import BertForSequenceClassification, BertTokenizer

In [2]:
# Load the "tweets_hate_speech_detection" dataset; later cells read its
# "train" split and the per-sample "tweet" and "label" fields.
dataset = load_dataset("tweets_hate_speech_detection")

Using custom data configuration default
Reusing dataset tweets_hate_speech_detection (/root/.cache/huggingface/datasets/tweets_hate_speech_detection/default/0.0.0/3e953745870454cf8ff15cc48097dbb5ff459596e0a089867c2a29cee63984ec)


  0%|          | 0/1 [00:00<?, ?it/s]

In [3]:
# Flatten the train split into two parallel lists: raw tweet text and its label.
tweets, labels = [], []
for row in dataset["train"]:
    tweets.append(row["tweet"])
    labels.append(row["label"])

# Every tweet must have exactly one label.
assert len(tweets) == len(labels)

In [4]:
# Index of the first validation sample: the leading 90% of the data (in its
# existing order, no shuffling) is used for training, the rest for validation.
split = (len(tweets) * 90) // 100

train_tweets, valid_tweets = tweets[:split], tweets[split:]
train_labels, valid_labels = labels[:split], labels[split:]

print(f"Train Samples --> {len(train_tweets)}")
print(f"Valid Samples --> {len(valid_tweets)}")

Train Samples --> 28765
Valid Samples --> 3197


In [5]:
class CustomTCDataset(Dataset):
    """Text-classification dataset pairing raw texts with their labels.

    Individual items are returned untokenized; tokenization is deferred to
    ``collate_fn`` so that a whole batch is encoded in a single tokenizer call.

    Args:
        tokenizer: Callable invoked as ``tokenizer(list_of_texts, max_length=...,
            padding=True, truncation=True, return_tensors='pt')`` whose result
            is indexable by ``'input_ids'`` and ``'attention_mask'``.
        input_texts: Indexable collection of input texts.
        target_labels: Indexable collection of labels, same length as
            ``input_texts``.
        max_input_length: Value forwarded to the tokenizer as ``max_length``.

    Raises:
        AssertionError: If ``input_texts`` and ``target_labels`` differ in length.
    """

    def __init__(self, tokenizer, input_texts, target_labels, max_input_length=16):
        assert len(input_texts) == len(target_labels), 'Input and Target labels sizes do not match'
        self.tokenizer = tokenizer
        self.input_texts = input_texts
        self.target_labels = target_labels
        self.max_input_length = max_input_length

    def __len__(self):
        """Return the number of (text, label) pairs."""
        return len(self.input_texts)

    def __getitem__(self, idx):
        """Return the raw text and label at ``idx`` as a dict with keys 'sents' and 'labels'."""
        return {
            'sents': self.input_texts[idx],
            'labels': self.target_labels[idx],
        }

    def collate_fn(self, batch):
        """Tokenize a list of items from ``__getitem__`` into one model-ready batch.

        Returns:
            dict with the tokenizer's 'input_ids' and 'attention_mask' plus a
            ``torch.long`` tensor of 'labels'.
        """
        sents = [item['sents'] for item in batch]
        labels = [item['labels'] for item in batch]

        encoded = self.tokenizer(
            sents,
            max_length=self.max_input_length,
            padding=True,
            truncation=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoded['input_ids'],
            'attention_mask': encoded['attention_mask'],
            'labels': torch.tensor(labels, dtype=torch.long),
        }

In [None]:
# Pretrained checkpoint shared by the tokenizer and the classifier.
model_name = "bert-base-uncased"

# Use the first CUDA device when one is available; otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained(model_name)
# Load the classifier and move it to the selected device in one step.
model = BertForSequenceClassification.from_pretrained(model_name).to(device)

# Count only the parameters that will receive gradients.
params = sum(weight.numel() for weight in model.parameters() if weight.requires_grad)
print(f"Trainable Parameters --> {params}")

In [7]:
# Batch size: used by both DataLoaders and as the Trainer's per-device
# train/eval batch size.
BS = 128
# Passed to CustomTCDataset as max_input_length, which it forwards to the
# tokenizer as max_length.
INPUT_LENGTH = 128

In [8]:
# Settings common to both splits.
dataset_kwargs = {"tokenizer": tokenizer, "max_input_length": INPUT_LENGTH}

train_dataset = CustomTCDataset(
    input_texts=train_tweets,
    target_labels=train_labels,
    **dataset_kwargs,
)
valid_dataset = CustomTCDataset(
    input_texts=valid_tweets,
    target_labels=valid_labels,
    **dataset_kwargs,
)

# Only the training loader shuffles; each loader batches through its own
# dataset's collate_fn.
train_loader = DataLoader(
    train_dataset,
    batch_size=BS,
    shuffle=True,
    collate_fn=train_dataset.collate_fn,
)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=BS,
    shuffle=False,
    collate_fn=valid_dataset.collate_fn,
)

print('Length of Train Loader: ', len(train_loader))
print('Length of Valid Loader: ', len(valid_loader))

Length of Train Loader:  225
Length of Valid Loader:  25


In [9]:
# Trainer hyperparameters. No optimizer is configured here, so the Trainer's
# default is used (the original note on this cell says AdamW — TODO confirm
# for the installed transformers version).
training_args = TrainingArguments(
    output_dir='./results',         # output directory
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=BS,  # batch size per device during training
    per_device_eval_batch_size=BS,   # batch size per device during evaluation
    evaluation_strategy='epoch',     # evaluation schedule: per epoch
    logging_strategy='epoch',        # logging schedule: per epoch
    save_strategy='no',              # checkpoint saving turned off
    fp16=False,                      # fp16 flag left disabled
    learning_rate=5e-5,              # learning rate handed to the optimizer
)

In [10]:
# Wire the model, hyperparameters and datasets into a HuggingFace Trainer.
# The collate function comes from train_dataset; valid_dataset is built with
# the same tokenizer and max_input_length, so its own collate_fn would behave
# identically.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=train_dataset.collate_fn,  # tokenizes each batch of raw texts
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

In [None]:
# Start fine-tuning with the settings in training_args.
trainer.train()