In [5]:
# Pin the release this notebook was originally run against (see the captured
# install log) so a fresh runtime reproduces the same environment. `%pip`
# installs into the environment of the running kernel.
%pip install transformers==4.21.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 29.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 69.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 73.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.2


In [44]:
# Number of output classes; passed as `num_labels` when the BERT classifier is
# created. NOTE(review): presumably must be >= the number of distinct intents
# in the training file -- confirm that 71 matches the data.
NUM_LABELS = 71

In [45]:
import abc


class LabelTracker(metaclass=abc.ABCMeta):
    """Interface for objects that map a label to an integer class index.

    Any class that exposes a callable ``get_intent_index`` attribute is
    accepted as a virtual subclass by ``issubclass(..., LabelTracker)`` and
    ``isinstance(..., LabelTracker)``.
    """

    @classmethod
    def __subclasshook__(cls, subclass):
        """Structural check consulted by ``issubclass``/``isinstance``.

        Returns ``True`` when ``subclass`` has a callable
        ``get_intent_index``; otherwise ``NotImplemented`` so the regular ABC
        machinery (real inheritance, registration) decides.
        """
        # This hook is inherited by concrete trackers. Without the guard,
        # ``issubclass(X, SomeConcreteTracker)`` would be True for any class X
        # that merely defines ``get_intent_index``.
        if cls is LabelTracker:
            if callable(getattr(subclass, 'get_intent_index', None)):
                return True
        return NotImplemented

    @abc.abstractmethod
    def get_intent_index(self, language: str) -> int:
        """Return the integer index associated with the given label.

        NOTE(review): the parameter is named ``language`` here while
        ``DictLabelTracker`` in this file names it ``intent``; the name is
        kept unchanged for interface compatibility.
        """
        raise NotImplementedError


class DictLabelTracker(LabelTracker):
    """Dict-backed label store that hands out indices on first sight.

    ``intents`` maps each label seen so far to its index; ``intent_index`` is
    the index that the next previously unseen label will receive.
    """

    def __init__(self):
        self.intent_index = 0
        self.intents = {}

    def get_intent_index(self, intent):
        """Return the index of ``intent``, registering it first if it is new."""
        try:
            # Fast path: the label has been registered before.
            return self.intents[intent]
        except KeyError:
            pass
        # First encounter: assign the next free index and advance the counter.
        assigned = self.intent_index
        self.intents[intent] = assigned
        self.intent_index = assigned + 1
        return assigned


In [59]:
import yaml
import random
from torch.utils.data.dataset import IterableDataset


class HelloEvolweDataset(IterableDataset):
    """Iterable dataset of intent-classification examples read from a YAML file.

    The file is read as a mapping with a top-level ``data`` list; every entry
    provides an ``intent`` and a list of ``examples``. Iterating yields one
    dict per example with the keys ``"text"`` and ``"intent_idx"``.

    Args:
        filename: Path of the YAML file to read.
        label_tracker: Object that maps an intent name to an integer index.
        shuffle: When true, the loaded samples are shuffled once, at
            construction time (not on every iteration).
    """

    def __init__(self, filename: str, label_tracker: LabelTracker, shuffle=True):
        super().__init__()
        self.label_tracker = label_tracker
        self.filename = filename
        # List of (example_text, intent_name) tuples.
        self.samples = self._load()
        if shuffle:
            random.shuffle(self.samples)

    def __iter__(self):
        """Yield ``{"text": ..., "intent_idx": ...}`` for every stored sample."""
        for text, intent in self.samples:
            yield {
                "text": text,
                "intent_idx": self.label_tracker.get_intent_index(intent),
            }

    def _load(self):
        """Read the YAML file and return a list of ``(example, intent)`` tuples.

        Every intent that has at least one example is registered with the
        label tracker here, in file order. That keeps the intent -> index
        mapping independent of the random shuffle, so it is the same on every
        run over the same file.
        """
        samples = []
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(self.filename, 'r', encoding='utf-8') as file:
            # NOTE(review): yaml.full_load can build more than plain
            # dicts/lists/strings. If this file may come from an untrusted
            # source, switch to yaml.safe_load.
            documents = yaml.full_load(file)
        for entry in documents['data']:
            intent = entry['intent']
            # An `examples:` key with no items is parsed as None by YAML.
            examples = entry['examples'] or ()
            if examples:
                self.label_tracker.get_intent_index(intent)
            samples.extend((example, intent) for example in examples)
        return samples

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.samples)


In [47]:
import argparse
from datetime import datetime

import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification

In [52]:
def train(args, model, tokenizer, device, train_loader, optimizer, epoch):
    """Run one training epoch over ``train_loader``.

    Args:
        args: Object with the attributes ``log_interval`` (log every N
            batches) and ``dry_run`` (stop after the first logged batch).
        model: Called as ``model(**encoded_input, labels=labels)``; the first
            element of its output is used as the loss.
        tokenizer: Object providing ``batch_encode_plus``.
        device: Device the labels and the encoded inputs are moved to.
        train_loader: Iterable of batches, each with a ``'text'`` and an
            ``'intent_idx'`` entry; must support ``len()`` and expose
            ``.dataset`` for the progress line.
        optimizer: Optimizer that updates the model parameters.
        epoch: Epoch number, used only in the progress line.
    """
    model.train()
    for batch_idx, sample in enumerate(train_loader):
        optimizer.zero_grad()

        # Labels are passed with the shape the loader produced them in
        # (assumed to be (batch_size,), which is the shape documented for
        # sequence-classification labels -- TODO confirm for custom collates).
        labels = sample['intent_idx'].to(device)

        texts = sample['text']
        encoded_input = tokenizer.batch_encode_plus(
            batch_text_or_text_pairs=texts,
            add_special_tokens=True,
            # Pad only up to the longest text of this batch instead of always
            # to 512 tokens; padded positions are masked out anyway.
            padding='longest',
            # Cut inputs that exceed max_length; without this, an over-long
            # text would reach the model with more than 512 positions.
            truncation=True,
            max_length=512,
            return_attention_mask=True,
            return_tensors='pt'
        ).to(device)

        outputs = model(**encoded_input, labels=labels)
        # With labels supplied, the loss is the first element of the output.
        loss = outputs[0]

        loss.backward()
        optimizer.step()

        if batch_idx % args.log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.12f}'.format(
                epoch, batch_idx * len(texts), len(train_loader.dataset),
                       100. * batch_idx / len(train_loader), loss.item()))
            # A dry run stops after the first logged batch.
            if args.dry_run:
                break

In [49]:
class Struct:
    """Plain attribute bag: each keyword argument becomes an instance attribute."""

    def __init__(self, **entries):
        # Write straight into the instance namespace, one key per attribute.
        vars(self).update(entries)

In [73]:
# Housekeeping cell (presumably run before re-creating the model below -- confirm):
# run a Python garbage-collection pass, then ask PyTorch to release the CUDA
# memory it is caching but not currently using.
import gc
gc.collect()
torch.cuda.empty_cache()

In [74]:
import os
import random

# training settings
args = Struct(**{
    'batch_size': 15,
    'epochs': 20,
    'lr': 0.0001,
    'log_interval': 10,
    'dry_run': False,
    # NOTE(review): not read anywhere in the visible code; a snapshot is
    # written after every epoch regardless of this value.
    'snapshot_interval': 50,
    # Seed for Python's and PyTorch's random number generators.
    'seed': 42,
})
SNAPSHOT_DIR = 'snapshots'

# Seed before the dataset is shuffled and before the classification head is
# randomly initialised, so that a re-run starts from the same state.
random.seed(args.seed)
torch.manual_seed(args.seed)

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(f"INFO: Using {device} device")

# shuffle stays False here: the DataLoader does not accept shuffle=True for an
# IterableDataset; the dataset shuffles its samples itself at construction.
train_kwargs = {'batch_size': args.batch_size, 'shuffle': False}
if use_cuda:
    train_kwargs.update({'num_workers': 0, 'pin_memory': True})

train_dataset = HelloEvolweDataset(
    filename='data/hello_nova_intents_0.2.2.yaml',
    label_tracker=DictLabelTracker(),
    shuffle=True
)

# Fail early with a clear message if the data holds more intents than the
# classifier has outputs; otherwise an out-of-range label only surfaces later
# as an opaque error inside the loss computation.
distinct_intents = {intent for _, intent in train_dataset.samples}
if len(distinct_intents) > NUM_LABELS:
    raise ValueError(
        f"Dataset contains {len(distinct_intents)} distinct intents, "
        f"but NUM_LABELS is {NUM_LABELS}"
    )

train_loader = DataLoader(train_dataset, **train_kwargs)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=NUM_LABELS,
    output_attentions=False,
    output_hidden_states=False
).to(device)

# weight_decay here means L2 regularization, s. https://stackoverflow.com/questions/42704283/adding-l1-l2-regularization-in-pytorch
optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=1e-5)

# torch.save does not create missing directories.
os.makedirs(SNAPSHOT_DIR, exist_ok=True)

for epoch in range(1, args.epochs + 1):
    train(args, model, tokenizer, device, train_loader, optimizer, epoch)
    # One full state-dict snapshot per epoch, named by wall-clock time.
    snapshot_name = datetime.now().strftime("%d-%m-%Y_%H:%M:%S") + '.pth'
    torch.save(model.state_dict(), os.path.join(SNAPSHOT_DIR, snapshot_name))
    # TODO: evaluate on a held-out set; no test loop is defined in the visible code.


INFO: Using cuda device


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at



In [76]:
# List the contents of the snapshot directory with human-readable sizes.
!ls -laFh snapshots

total 8.6G
drwxr-xr-x 3 root root 4.0K Sep  2 18:34 ./
drwxr-xr-x 1 root root 4.0K Sep  2 17:33 ../
-rw-r--r-- 1 root root 418M Sep  2 18:17 02-09-2022_18:17:55.pth
-rw-r--r-- 1 root root 418M Sep  2 18:19 02-09-2022_18:19:27.pth
-rw-r--r-- 1 root root 418M Sep  2 18:20 02-09-2022_18:20:14.pth
-rw-r--r-- 1 root root 418M Sep  2 18:21 02-09-2022_18:21:00.pth
-rw-r--r-- 1 root root 418M Sep  2 18:21 02-09-2022_18:21:47.pth
-rw-r--r-- 1 root root 418M Sep  2 18:22 02-09-2022_18:22:34.pth
-rw-r--r-- 1 root root 418M Sep  2 18:23 02-09-2022_18:23:20.pth
-rw-r--r-- 1 root root 418M Sep  2 18:24 02-09-2022_18:24:07.pth
-rw-r--r-- 1 root root 418M Sep  2 18:24 02-09-2022_18:24:53.pth
-rw-r--r-- 1 root root 418M Sep  2 18:25 02-09-2022_18:25:40.pth
-rw-r--r-- 1 root root 418M Sep  2 18:26 02-09-2022_18:26:26.pth
-rw-r--r-- 1 root root 418M Sep  2 18:27 02-09-2022_18:27:13.pth
-rw-r--r-- 1 root root 418M Sep  2 18:28 02-09-2022_18:27:59.pth
-rw-r--r-- 1 root root 418M Sep  2 18:28 02-09-2022_18: