In [1]:
from datasets import load_dataset
import pandas as pd
import numpy as np
import requests
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW
from torch.nn.utils import clip_grad_norm_
from tqdm.notebook import tqdm

2023-04-11 20:50:19.679491: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-11 20:50:21.431319: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-04-11 20:50:21.431500: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [3]:
# Tokenizer for the 'bert-base-uncased' checkpoint -- the same checkpoint name
# the classifier is later loaded from. do_lower_case=True is passed explicitly.
tokeniser = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)

In [2]:
def encode(corpus, max_length=128, tokenizer=None):
    """Tokenise a batch of texts into fixed-length PyTorch tensors.

    Every text is truncated or padded to exactly ``max_length`` tokens, with
    the tokenizer's special tokens added.

    Args:
        corpus: The texts to encode. A list or tuple of strings is passed
            through unchanged; any other non-string iterable (for example a
            pandas Series or a generator) is first copied into a list.
        max_length: Length every encoded sequence is truncated/padded to.
        tokenizer: Tokenizer object to call. When omitted, the notebook-level
            ``tokeniser`` is used.

    Returns:
        A tuple ``(ids, masks)`` holding the ``'input_ids'`` and
        ``'attention_mask'`` entries of the tokenizer output.
    """
    if tokenizer is None:
        # Looked up at call time, so the cell defining `tokeniser` only has to
        # run before encode() is first called, not before it is defined.
        tokenizer = tokeniser

    if not isinstance(corpus, (str, list, tuple)):
        corpus = list(corpus)

    # Calling the tokenizer directly is the supported replacement for the
    # deprecated batch_encode_plus() and accepts the same keyword arguments.
    encoded = tokenizer(corpus,
                        max_length=max_length,
                        add_special_tokens=True,
                        return_attention_mask=True,
                        truncation=True,
                        return_tensors='pt',
                        padding='max_length')
    return encoded['input_ids'], encoded['attention_mask']


def get_dataloader(ids, masks, data, batch_size=16, shuffle=True):
    """Bundle aligned tensors into a batched ``DataLoader``.

    Args:
        ids: Tensor of token ids, one row per example.
        masks: Tensor of attention masks aligned with ``ids``.
        data: Tensor aligned with ``ids`` along the first dimension (the
            notebook passes the label tensor here).
        batch_size: Number of examples per batch.
        shuffle: If True (the default) examples are drawn in random order via
            ``RandomSampler``; if False they are yielded in their stored order
            via ``SequentialSampler``, which suits evaluation splits.

    Returns:
        A ``DataLoader`` whose batches are ``(ids, masks, data)`` tuples.
    """
    tensored = TensorDataset(ids, masks, data)
    sampler = RandomSampler(tensored) if shuffle else SequentialSampler(tensored)
    return DataLoader(tensored, sampler=sampler, batch_size=batch_size)

In [4]:
# Larger variant of the corpus: the 'unsplit' configuration, requested as its
# 'train' split.
dataset = load_dataset(
    'dair-ai/emotion',
    name='unsplit',
    split='train',
)

# Smaller alternative: the 'train' split of the 'split' configuration.
# dataset = load_dataset('dair-ai/emotion', name='split', split='train')


Found cached dataset emotion (/home/zum/.cache/huggingface/datasets/dair-ai___emotion/unsplit/1.0.0/cca5efe2dfeb58c1d098e0f9eeb200e9927d889b5a03c67097275dfb5fe463bd)


In [5]:
# Switch the dataset's output format to pandas (this mutates `dataset` in place).
dataset.set_format('pandas')

# Number of distinct values in the 'label' column; used as num_labels when the
# classifier is built.
N = len(set(dataset['label']))

In [7]:
# Pull every row out of the dataset in one go (with the pandas format set
# above this is expected to be a DataFrame with 'text' and 'label' columns).
all_data = dataset[:]

# Hold out 15% of the rows as the test set, then 10% of what remains as the
# validation set. A fixed random_state keeps both splits reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    all_data['text'], all_data['label'], test_size=0.15, random_state=1)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.1, random_state=1)

In [9]:
# Shapes of the train, test and validation text splits, in that order.
print(*(part.shape for part in (x_train, x_test, x_val)))

(318858,) (62522,) (35429,)


In [10]:
# Tokenise each text split into token-id and attention-mask tensors.
# The text columns are converted to plain Python lists before encoding.
train_ids, train_masks = encode(x_train.tolist())
val_ids, val_masks = encode(x_val.tolist())
test_ids, test_masks = encode(x_test.tolist())

In [11]:
# Replace each label column with a 64-bit integer tensor so it can sit next to
# the token ids inside a TensorDataset.
y_train = torch.tensor(list(y_train), dtype=torch.long)
y_val = torch.tensor(list(y_val), dtype=torch.long)
y_test = torch.tensor(list(y_test), dtype=torch.long)

In [12]:
# Training batches are drawn in random order (get_dataloader's default sampler).
train_loader = get_dataloader(train_ids, train_masks, y_train)

# Evaluation splits are iterated in their stored order so that the i-th
# prediction lines up with the i-th row of x_val / x_test. Shuffling them
# gives no benefit and makes per-example inspection impossible.
val_loader = DataLoader(TensorDataset(val_ids, val_masks, y_val),
                        batch_size=16, shuffle=False)
test_loader = DataLoader(TensorDataset(test_ids, test_masks, y_test),
                         batch_size=16, shuffle=False)

In [13]:
# Sequence classifier loaded from the 'bert-base-uncased' checkpoint, with one
# output per distinct label (N). Attention maps and hidden states are not
# requested in the model outputs.
classifier = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=N,
    output_attentions=False,
    output_hidden_states=False,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [16]:
# Training runs on the CPU. The training loop moves every batch to `device`,
# so the model is placed on that same device here.
device = 'cpu'
classifier = classifier.to(device)

In [17]:
# AdamW over every parameter of the classifier.
optimiser = AdamW(classifier.parameters(), lr=1e-4)

# Learning-rate schedule with no warm-up steps, sized for 30 full passes over
# train_loader (the training loop steps the scheduler once per batch).
scheduler = get_linear_schedule_with_warmup(
    optimiser,
    num_warmup_steps=0,
    num_training_steps=30 * len(train_loader),
)

In [None]:
# Mean training loss of each completed epoch, in order.
train_losses = []

# Fail early with a clear message rather than dividing by zero after the
# first (empty) epoch.
n_batches = len(train_loader)
if n_batches == 0:
    raise ValueError('train_loader is empty; there is nothing to train on')

# NOTE: the epoch count must stay in sync with the num_training_steps that the
# scheduler was built with (30 * len(train_loader)).
for epoch in range(30):
    classifier.train()
    train_loss = 0

    for data in tqdm(train_loader, desc='Training'):
        ids, masks, labels = [x.to(device) for x in data]

        # Passing `labels` makes the model return the loss alongside its outputs.
        output = classifier(input_ids=ids, attention_mask=masks, labels=labels)
        loss = output.loss
        train_loss += loss.item()

        classifier.zero_grad()
        loss.backward()

        # Clip the overall gradient norm to 1 before the parameter update.
        clip_grad_norm_(parameters=classifier.parameters(), max_norm=1)
        optimiser.step()
        scheduler.step()

    # Average over the number of batches instead of relying on a loop index
    # leaking out of the inner loop.
    train_losses.append(train_loss / n_batches)
      

Training:   0%|          | 0/19929 [00:00<?, ?it/s]

In [None]:
def save_model(model, path):
    """Write the model's ``state_dict()`` to ``path`` with ``torch.save``.

    Only the state dict is serialised, not the model object itself.

    Args:
        model: Object exposing ``state_dict()``.
        path: Destination handed straight to ``torch.save``.
    """
    weights = model.state_dict()
    torch.save(weights, path)