In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 49.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 69.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.2


In [2]:
import numpy as np
import torch
from torch.utils.data import DataLoader
from torch.utils.data import SubsetRandomSampler
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification
from datetime import datetime

In [3]:
NUM_LABELS = 67

In [4]:
class LabelTracker:
    """A container for labels with lazy registration"""

    def __init__(self):
        self.label_idx = 0
        self.labels = {}

    def get_intent_index(self, label):
        if label not in self.labels.keys():
            self.labels[label] = self.label_idx
            self.label_idx += 1
        return self.labels[label]

    def get_num_labels(self):
        return len(self.labels)

In [6]:
from typing import List, Tuple

import csv
import numpy as np
import torch.utils.data
from torch.utils.data.dataset import Dataset
from torch.utils.data import SubsetRandomSampler


class HelloEvolweDataset(Dataset):
    def __init__(self, filename: str, label_tracker: LabelTracker):
        super(HelloEvolweDataset, self).__init__()
        self.label_tracker = label_tracker
        self.filename = filename
        self.samples = self._load()

    def __getitem__(self, idx):
        sample = self.samples[idx]
        return {
            "text": sample[0],
            "intent_idx": sample[2]
        }

    def __len__(self) -> int:
        return len(self.samples)

    def _load(self) -> List[Tuple[str, str, int]]:
        samples = []
        with open(self.filename, 'r') as f:
            reader = csv.DictReader(f)
            for entry in reader:
                samples.append((
                    entry['text'],
                    entry['intent'],
                    self.label_tracker.get_intent_index(entry['intent'])
                ))
        return samples

In [7]:
def train(args, model, tokenizer, device, train_loader, optimizer, epoch):
    model.train()

    for batch_idx, batch in enumerate(train_loader):
        optimizer.zero_grad()

        labels = batch['intent_idx'].to(device)

        texts = batch['text']
        encoded_input = tokenizer.batch_encode_plus(
            batch_text_or_text_pairs=texts,
            add_special_tokens=True,
            padding='max_length',
            max_length=512,
            return_attention_mask=True,
            return_tensors='pt'
        ).to(device)

        outputs = model(**encoded_input)
        logits = outputs['logits']

        criterion = torch.nn.CrossEntropyLoss()
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        if batch_idx % args['log_interval'] == 0:
            print('Train epoch {} ({:.0f}%):\tloss: {:.12f}'.format(
                epoch, 100. * batch_idx / len(train_loader), loss.item())
            )

In [8]:
def evaluate(model, tokenizer, device, test_loader):
    model.eval()

    validation_accuracy = []
    validation_loss = []

    for batch in test_loader:
        labels = batch['intent_idx'].to(device)

        texts = batch['text']
        encoded_input = tokenizer.batch_encode_plus(
            batch_text_or_text_pairs=texts,
            add_special_tokens=True,
            padding='max_length',
            max_length=512,
            return_attention_mask=True,
            return_tensors='pt'
        ).to(device)

        with torch.no_grad():
            outputs = model(**encoded_input)
            logits = outputs['logits']

        criterion = torch.nn.CrossEntropyLoss()
        loss = criterion(logits, labels)
        validation_loss.append(loss.item())

        predictions = torch.argmax(logits, dim=1).flatten()
        accuracy = torch.eq(predictions, labels).cpu().numpy().mean()
        validation_accuracy.append(accuracy)

    return np.mean(validation_loss), np.mean(validation_accuracy)

In [12]:
# torch.cuda.empty_cache()

# training settings
args = {
    'batch_size': 32,
    'epochs': 100,
    'lr': 1e-5,
    'log_interval': 10,
    'snapshot_interval': 100
}

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(f"INFO: Using {device} device")

train_kwargs = {'batch_size': args['batch_size'], 'shuffle': False}
if use_cuda:
    train_kwargs.update({'num_workers': 0, 'pin_memory': True})


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=NUM_LABELS,
    output_attentions=False,
    output_hidden_states=False
).to(device)
# print(model)

# freeze (some) BERT layers to avoid GPU Out-of-Memory error
for name, param in model.named_parameters():
    if name.startswith("bert.embeddings"):
        param.requires_grad = False
    if name.startswith("bert.encoder.layer") and not \
            (name.startswith("bert.encoder.layer.8") or
              name.startswith("bert.encoder.layer.9") or
              name.startswith("bert.encoder.layer.10") or
              name.startswith("bert.encoder.layer.11")):
        param.requires_grad = False

# weight_decay here means L2 regularization, s. https://stackoverflow.com/questions/42704283/adding-l1-l2-regularization-in-pytorch
# also skip frozen parameters
optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=args['lr'], eps=1e-8, weight_decay=1e-4)

label_tracker = LabelTracker()
dataset = HelloEvolweDataset(filename='/content/data/dataset.csv', label_tracker=label_tracker)

# splits
test_split_portion = 0.2
n_samples = len(dataset)
indices = list(range(n_samples))
split_idx = int(np.floor(test_split_portion * n_samples))

# shuffle
random_seed = 42
np.random.seed(random_seed)
np.random.shuffle(indices)

train_indices, test_indices = indices[split_idx:], indices[:split_idx]

# samplers
train_sampler = SubsetRandomSampler(train_indices)
test_sampler = SubsetRandomSampler(test_indices)

train_loader = DataLoader(dataset, sampler=train_sampler, **train_kwargs)
test_loader = DataLoader(dataset, sampler=test_sampler, **train_kwargs)

# start where we ended last time
# model.load_state_dict(torch.load('../snapshots/03-09-2022_22:38:04_e149_lr1e-6.pth'))

for epoch in range(1, args['epochs'] + 1):
    train(args, model, tokenizer, device, train_loader, optimizer, epoch)
    torch.save(model.state_dict(), '/content/snapshots/' + datetime.now().strftime("%d-%m-%Y_%H:%M:%S") + '.pth')
    validation_loss, validation_accuracy = evaluate(model, tokenizer, device, test_loader)
    print("Eval. epoch {}:\tloss = {:.12f}, accuracy = {:.4f}".format(epoch, validation_loss, validation_accuracy))



INFO: Using cuda device


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Train epoch 1 (0%):	loss: 4.265549659729
Train epoch 1 (83%):	loss: 4.235233783722
Eval. epoch 1:	loss = 4.217422326406, accuracy = 0.0104
Train epoch 2 (0%):	loss: 4.234727382660
Train epoch 2 (83%):	loss: 4.160099506378
Eval. epoch 2:	loss = 4.199622631073, accuracy = 0.0104
Train epoch 3 (0%):	loss: 4.111648082733
Train epoch 3 (83%):	loss: 4.208429813385
Eval. epoch 3:	loss = 4.187750021617, accuracy = 0.0222
Train epoch 4 (0%):	loss: 4.162680625916
Train epoch 4 (83%):	loss: 4.114163398743
Eval. epoch 4:	loss = 4.180076281230, accuracy = 0.0312
Train epoch 5 (0%):	loss: 4.106890678406
Train epoch 5 (83%):	loss: 4.007476329803
Eval. epoch 5:	loss = 4.160494804382, accuracy = 0.0215
Train epoch 6 (0%):	loss: 4.032822132111
Train epoch 6 (83%):	loss: 4.025674343109
Eval. epoch 6:	loss = 4.140016714732, accuracy = 0.0312
Train epoch 7 (0%):	loss: 3.925719976425
Train epoch 7 (83%):	loss: 4.037945270538
Eval. epoch 7:	loss = 4.093083699544, accuracy = 0.0757
Train epoch 8 (0%):	loss: 3