In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 68.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 48.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.2


In [76]:
# Size of the classification head: passed as `num_labels` when the BERT model
# is created in the training cell.
# NOTE(review): presumably the number of distinct intents in data/train.csv —
# confirm it equals LabelTracker.get_num_labels() after the dataset is loaded.
NUM_LABELS = 67

In [77]:
class LabelTracker:
    """Maps labels to consecutive integer indices, assigned on first sight."""

    def __init__(self):
        # label -> index, in order of first registration
        self.labels = {}
        # index that the next previously unseen label will receive
        self.label_idx = 0

    def get_intent_index(self, label):
        """Return the index of ``label``, registering it first if it is new."""
        if label in self.labels:
            return self.labels[label]
        assigned = self.label_idx
        self.labels[label] = assigned
        self.label_idx = assigned + 1
        return assigned

    def get_num_labels(self):
        """Return how many distinct labels have been registered so far."""
        return len(self.labels)

In [78]:
from typing import List, Tuple

import csv
from torch.utils.data.dataset import Dataset


class HelloEvolweDataset(Dataset):
    """Intent-classification samples read from a CSV file.

    The file is loaded eagerly on construction. It must have a header row
    containing at least the columns ``text`` and ``intent``. Each intent
    string is registered with the supplied label tracker, and the resulting
    integer index is stored alongside the sample.
    """

    # The tracker annotation is a forward-reference string so that this class
    # can be defined even if the cell defining ``LabelTracker`` has not been
    # run yet; any object with ``get_intent_index`` / ``get_num_labels`` works.
    def __init__(self, filename: str, label_tracker: "LabelTracker"):
        super().__init__()
        self.label_tracker = label_tracker
        self.filename = filename
        self.samples = self._load()

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``{"text": str, "intent_idx": int}``."""
        text, _intent, intent_idx = self.samples[idx]
        return {
            "text": text,
            "intent_idx": intent_idx
        }

    def __len__(self) -> int:
        """Return the number of loaded samples."""
        return len(self.samples)

    def get_class_weights(self) -> List[float]:
        """Return one loss weight per label known to the tracker.

        Weights follow the "balanced" inverse-frequency rule
        ``n_samples / (n_classes * count)``, so rare intents receive a larger
        weight than frequent ones. The previous ``count / n_classes`` formula
        did the opposite and amplified class imbalance.

        Labels that the tracker knows but that never occur in this dataset
        (possible when a tracker is shared between datasets) get weight 0.0.
        The list is indexed by label index and has ``get_num_labels()`` items.
        """
        n_classes = self.label_tracker.get_num_labels()
        counts = [0] * n_classes
        # Use the index stored at load time rather than asking the tracker
        # again: ``get_intent_index`` registers unknown labels as a side effect.
        for _text, _intent, intent_idx in self.samples:
            counts[intent_idx] += 1
        total = len(self.samples)
        return [total / (n_classes * count) if count else 0.0 for count in counts]

    def _load(self) -> List[Tuple[str, str, int]]:
        """Read the CSV and return ``(text, intent, intent_index)`` tuples.

        Raises:
            KeyError: if a row lacks the ``text`` or ``intent`` column.
        """
        samples = []
        # newline='' lets the csv module handle line endings itself, which is
        # required for quoted fields that contain embedded newlines.
        with open(self.filename, 'r', newline='') as f:
            for entry in csv.DictReader(f):
                intent = entry['intent']
                samples.append((
                    entry['text'],
                    intent,
                    self.label_tracker.get_intent_index(intent)
                ))
        return samples

In [86]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from datetime import datetime

In [87]:
def train(args, model, tokenizer, device, train_loader, optimizer, scheduler, epoch, class_weights):
    """Run one training epoch with a class-weighted cross-entropy loss.

    Args:
        args: dict read for ``'log_interval'`` (log every N-th batch) and
            ``'dry_run'`` (stop after the first logged batch).
        model: called as ``model(**encoded_input)``; the result must expose
            the class scores under ``['logits']``.
        tokenizer: must provide ``batch_encode_plus`` returning an object that
            supports ``.to(device)`` and ``**`` unpacking.
        device: device that inputs, labels and loss weights are moved to.
        train_loader: iterable of batches shaped like
            ``{'text': [...], 'intent_idx': tensor}``; must also support
            ``len()`` and expose ``.dataset`` (used for the log line).
        optimizer: stepped once per batch.
        scheduler: stepped once per batch, so it must be configured with the
            total number of batches over all epochs, not the number of epochs.
        epoch: epoch number, used only in the log line.
        class_weights: one weight per class (sequence or tensor).
    """
    model.train()
    # Force float32: integer weights would otherwise produce an integer tensor
    # that the loss rejects. ``as_tensor`` also accepts an existing tensor.
    class_weights = torch.as_tensor(class_weights, dtype=torch.float32).to(device)
    # The criterion does not change between batches, so build it once.
    criterion = torch.nn.CrossEntropyLoss(weight=class_weights)

    for batch_idx, sample in enumerate(train_loader):
        optimizer.zero_grad()

        labels = sample['intent_idx'].to(device)

        texts = sample['text']
        encoded_input = tokenizer.batch_encode_plus(
            batch_text_or_text_pairs=texts,
            add_special_tokens=True,
            # Pad only to the longest text in the batch instead of always
            # padding to 512 tokens; padded positions are masked out anyway.
            padding='longest',
            # Without truncation a text longer than max_length is passed on
            # at full length and breaks batching / the model's position limit.
            truncation=True,
            max_length=512,
            return_attention_mask=True,
            return_tensors='pt'
        ).to(device)

        outputs = model(**encoded_input)
        logits = outputs['logits']

        loss = criterion(logits, labels)

        loss.backward()

        optimizer.step()
        scheduler.step()

        if batch_idx % args['log_interval'] == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.12f}'.format(
                epoch, batch_idx * len(texts), len(train_loader.dataset),
                       100. * batch_idx / len(train_loader), loss.item()))
            if args['dry_run']:
                break

In [88]:
# Reclaim memory before (re)building the model, presumably to avoid CUDA
# out-of-memory errors when the training cell is re-run in the same kernel:
# first collect unreachable Python objects, then ask PyTorch to release the
# GPU memory it is holding in its cache.
import gc
gc.collect()
torch.cuda.empty_cache()

In [None]:
import os

# training settings
args = {
    'batch_size': 15,
    'epochs': 20,
    'lr': 5e-5,
    'log_interval': 10,   # print the loss every N batches
    'dry_run': False,     # True: stop each epoch after the first logged batch
    'snapshot_interval': 100  # NOTE(review): not read by any code visible here
}

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(f"INFO: Using {device} device")

train_kwargs = {'batch_size': args['batch_size'], 'shuffle': True}
if use_cuda:
    train_kwargs.update({'num_workers': 0, 'pin_memory': True})

# Load the data before the model so that a label-count mismatch is reported
# immediately instead of as a shape error inside the loss after the model has
# been downloaded and moved to the GPU.
train_dataset = HelloEvolweDataset(
    filename='data/train.csv',
    label_tracker=LabelTracker()
)
train_loader = DataLoader(train_dataset, **train_kwargs)
class_weights = train_dataset.get_class_weights()

num_labels_in_data = train_dataset.label_tracker.get_num_labels()
if num_labels_in_data != NUM_LABELS:
    raise ValueError(
        f"NUM_LABELS is {NUM_LABELS} but the training data contains "
        f"{num_labels_in_data} distinct intents; the classifier head and the "
        f"class-weight vector must have the same size"
    )

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=NUM_LABELS,
    output_attentions=False,
    output_hidden_states=False
).to(device)
# print(model)

# AdamW applies decoupled weight decay (it is not an L2 term added to the loss);
# background on L1/L2 regularization in PyTorch:
# https://stackoverflow.com/questions/42704283/adding-l1-l2-regularization-in-pytorch
# torch.optim.AdamW replaces the deprecated transformers.AdamW.
optimizer = torch.optim.AdamW(model.parameters(), lr=args['lr'], eps=1e-8, weight_decay=1e-4)

# train() calls scheduler.step() once per batch, so the schedule has to span
# every batch of every epoch. Passing only the epoch count made the learning
# rate decay to zero after the first `epochs` batches.
total_training_steps = len(train_loader) * args['epochs']
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps
)

# start where we ended last time
# model.load_state_dict(torch.load('/content/snapshots/02-09-2022_19:01:31.pth'))

# Make sure the snapshot directory exists before training, so that torch.save
# cannot fail after a full epoch of work.
os.makedirs('snapshots', exist_ok=True)

for epoch in range(1, args['epochs'] + 1):
    train(args, model, tokenizer, device, train_loader, optimizer, scheduler, epoch, class_weights)
    # One full state dict per epoch, named by wall-clock time.
    torch.save(model.state_dict(), 'snapshots/' + datetime.now().strftime("%d-%m-%Y_%H:%M:%S") + '.pth')
    # test(model, device, test_loader)


INFO: Using cuda device


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at



In [None]:
# List the saved snapshots with human-readable sizes; the training loop writes
# one state dict per epoch, so this is a quick check on disk usage.
!ls -laFh snapshots

total 8.6G
drwxr-xr-x 3 root root 4.0K Sep  2 18:34 ./
drwxr-xr-x 1 root root 4.0K Sep  2 17:33 ../
-rw-r--r-- 1 root root 418M Sep  2 18:17 02-09-2022_18:17:55.pth
-rw-r--r-- 1 root root 418M Sep  2 18:19 02-09-2022_18:19:27.pth
-rw-r--r-- 1 root root 418M Sep  2 18:20 02-09-2022_18:20:14.pth
-rw-r--r-- 1 root root 418M Sep  2 18:21 02-09-2022_18:21:00.pth
-rw-r--r-- 1 root root 418M Sep  2 18:21 02-09-2022_18:21:47.pth
-rw-r--r-- 1 root root 418M Sep  2 18:22 02-09-2022_18:22:34.pth
-rw-r--r-- 1 root root 418M Sep  2 18:23 02-09-2022_18:23:20.pth
-rw-r--r-- 1 root root 418M Sep  2 18:24 02-09-2022_18:24:07.pth
-rw-r--r-- 1 root root 418M Sep  2 18:24 02-09-2022_18:24:53.pth
-rw-r--r-- 1 root root 418M Sep  2 18:25 02-09-2022_18:25:40.pth
-rw-r--r-- 1 root root 418M Sep  2 18:26 02-09-2022_18:26:26.pth
-rw-r--r-- 1 root root 418M Sep  2 18:27 02-09-2022_18:27:13.pth
-rw-r--r-- 1 root root 418M Sep  2 18:28 02-09-2022_18:27:59.pth
-rw-r--r-- 1 root root 418M Sep  2 18:28 02-09-2022_18:

In [16]:
# Move one snapshot out of the runtime's local disk into what looks like a
# mounted Google Drive folder, presumably so it outlives the Colab session.
# NOTE(review): the snapshot file name and the destination are hard-coded —
# update both before re-running this cell.
!mv /content/snapshots/03-09-2022_06:53:41.pth /content/drive/MyDrive/Colab\ Snapshots/evelowe_test_assignment