## Validate GPU is available for use

In [1]:
# Shell escape: print the GPU model, driver/CUDA versions and current memory use.
!nvidia-smi

Wed Dec  9 14:40:04 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.06    Driver Version: 450.51.06    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import torch
# True when PyTorch can see a CUDA device; Trainer picks 'cpu' when this is False.
torch.cuda.is_available()

True

## Install necessary packages

In [3]:
# ! pip install -U adapter-transformers

In [4]:
# ! pip install datasets

## Load and inspect data

In [5]:
import datasets
# Load the 'sst2' configuration of the 'glue' dataset (reused from the local
# Hugging Face cache when it is already present).
sst2 = datasets.load_dataset('glue', 'sst2')
# Display the DatasetDict so the splits, features and row counts are visible.
sst2

Reusing dataset glue (/home/jupyter/.cache/huggingface/datasets/glue/sst2/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [6]:
# Peek at the first five training examples; each one is a dict with
# 'idx', 'label' and 'sentence' keys.
for i in range(5):
    print(sst2['train'][i])

{'idx': 0, 'label': 0, 'sentence': 'hide new secretions from the parental units '}
{'idx': 1, 'label': 0, 'sentence': 'contains no wit , only labored gags '}
{'idx': 2, 'label': 1, 'sentence': 'that loves its characters and communicates something rather beautiful about human nature '}
{'idx': 3, 'label': 0, 'sentence': 'remains utterly satisfied to remain the same throughout '}
{'idx': 4, 'label': 0, 'sentence': 'on the worst revenge-of-the-nerds clichés the filmmakers could dredge up '}


## Load Tokenizer

In [7]:
import transformers
from transformers import BertTokenizerFast, BertForSequenceClassification

# Checkpoint name shared by the tokenizer here and by create_model() further down,
# so that both are loaded from the same pretrained checkpoint.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(model_name)

## Create tensor datasets for each split

In [8]:
from torch.utils.data import DataLoader, TensorDataset

def create_dataset_from_text_dataset(ds):
    """Tokenize one text split and pack it into a ``TensorDataset``.

    Reads ``ds['sentence']`` and ``ds['label']``. All sentences are encoded in
    a single tokenizer call with ``padding=True`` and ``truncation=True``, so
    every row of the split shares one padded length.

    Returns:
        TensorDataset yielding ``(input_ids, attention_mask, labels)``.
    """
    encoded = tokenizer(
        ds['sentence'],
        return_tensors='pt',
        padding=True,
        truncation=True,
    )
    tensors = (
        encoded['input_ids'],
        encoded['attention_mask'],
        torch.tensor(ds['label']),
    )
    return TensorDataset(*tensors)

# Build one TensorDataset per split, keyed by split name (order follows `splits`).
splits = ['train', 'validation', 'test']
split_datasets = dict.fromkeys(splits)

for s in split_datasets:
    # Replacing values (not keys) while iterating keeps the dict size fixed.
    split_datasets[s] = create_dataset_from_text_dataset(sst2[s])

split_datasets

{'train': <torch.utils.data.dataset.TensorDataset at 0x7f57f1bba4d0>,
 'validation': <torch.utils.data.dataset.TensorDataset at 0x7f57f1bba690>,
 'test': <torch.utils.data.dataset.TensorDataset at 0x7f57f1bba810>}

In [9]:
# Validate data loader
# sample_loader = DataLoader(split_datasets['train'], batch_size=3, shuffle=True)
# for i in sample_loader:
#     input_ids, attn_masks, labels = i
#     decoded = tokenizer.batch_decode(input_ids)
#     for d in decoded:
#         print(d)
#     break

## Create model

In [10]:
def create_model(add_adapters=False):
    """Load ``model_name`` as a two-label sequence classifier.

    Args:
        add_adapters: When true, additionally add a text-task adapter named
            "sst2" and call ``train_adapter("sst2")`` on the model.
            NOTE(review): presumably ``train_adapter`` freezes the pretrained
            weights so only the adapter is trained -- confirm against the
            adapter-transformers documentation.

    Returns:
        The ``BertForSequenceClassification`` instance.
    """
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
    if not add_adapters:
        return model

    # Imported lazily so the plain (adapter-free) path does not need AdapterType.
    from transformers import AdapterType

    adapter_name = "sst2"
    model.add_adapter(adapter_name, AdapterType.text_task)
    model.train_adapter(adapter_name)
    return model

In [11]:
# Training batches are small and reshuffled; validation batches are larger and
# kept in dataset order.
train_loader = DataLoader(
    dataset=split_datasets['train'],
    batch_size=16,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=split_datasets['validation'],
    batch_size=128,
    shuffle=False,
)

In [12]:
from transformers import logging
# Set the transformers logger to WARNING verbosity; the checkpoint-loading
# warnings printed below are emitted at that level.
logging.set_verbosity_warning()

# Build the classifier with the "sst2" adapter added (see create_model).
model = create_model(add_adapters=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [13]:
# Collect the distinct parameter-name prefixes -- the part of each name before
# the first '.weight' / '.bias' -- in the order the model yields them.
# The dict is used as an ordered set; the 0 values are placeholders.
prefixes = {}
for n, p in model.named_parameters():
    base = n.partition('.weight')[0].partition('.bias')[0]
    prefixes.setdefault(base, 0)

In [14]:
BASE_LR = 0.1
BASE_DIVISOR = 2.6

# Divisors fall linearly from BASE_DIVISOR * (len(prefixes) - 1) for the first
# prefix down to BASE_DIVISOR for the second-to-last; the last prefix gets 1,
# i.e. the full BASE_LR.
# NOTE(review): the growth is linear in depth (2.6 * i), not geometric (2.6 ** i)
# -- confirm this is the intended schedule.
prefix_divisors = [BASE_DIVISOR * i for i in range(len(prefixes) - 1, 0, -1)]
prefix_divisors.append(1)
layer_learning_rates = [BASE_LR / ld for ld in prefix_divisors]

# Pair prefixes with learning rates positionally, in dict insertion order.
prefix_lr_lookup = dict(zip(prefixes, layer_learning_rates))

In [15]:
# One optimizer param group per parameter tensor, each carrying the learning
# rate looked up from its name prefix (text before the first '.weight'/'.bias').
optimizer_grouped_parameters = [
    dict(
        params=param,
        lr=prefix_lr_lookup[name.split('.weight', 1)[0].split('.bias', 1)[0]],
    )
    for name, param in model.named_parameters()
]

In [16]:
# No global lr is passed: every param group in optimizer_grouped_parameters
# carries its own 'lr' entry.
optimizer = torch.optim.SGD(optimizer_grouped_parameters)

In [17]:
import time
import torch.nn.functional as F
import copy

class Trainer:
    """Minimal training / evaluation harness for a sequence classifier.

    The model is called as ``model(input_ids=..., attention_mask=...)`` and the
    first element of its output is used as the logits. Every batch must be an
    ``(input_ids, attention_mask, labels)`` triple of tensors.

    Args:
        model: Module to train; moved to ``cuda:0`` when CUDA is available,
            otherwise kept on the CPU.
        n_epochs: Number of passes over the training loader in ``train_loop``.
        optimizer: Optimizer already configured with the model's parameters.
        scheduler: Optional learning-rate scheduler, stepped once after every
            optimizer step. Defaults to ``None`` (learning rates stay fixed).
    """

    def __init__(self, model, n_epochs, optimizer, scheduler=None):
        self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        self.model = model.to(self.device)
        self.n_epochs = n_epochs
        self.optimizer = optimizer
        # Always define the attribute: train_loop reads self.scheduler, and
        # leaving it unset made the first training step raise AttributeError.
        self.scheduler = scheduler
        self.criterion = torch.nn.CrossEntropyLoss().to(self.device)

    def _to_device(self, batch):
        """Move an ``(input_ids, attention_mask, labels)`` batch to ``self.device``."""
        return tuple(tensor.to(self.device) for tensor in batch[:3])

    def measure_performance(self, loader):
        """Evaluate the model on ``loader`` without tracking gradients.

        The model is switched to eval mode for the duration of the call (so
        dropout does not perturb the metrics) and its previous mode is
        restored afterwards, even if evaluation raises.

        Args:
            loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.

        Returns:
            ``(loss, accuracy)`` as Python floats, where ``loss`` is the mean
            of the per-batch losses and ``accuracy`` is the fraction of
            correctly classified examples. Both are ``nan`` for an empty loader.
        """
        was_training = self.model.training
        self.model.eval()

        loss_sum = 0.0
        correct_count = 0
        total_count = 0
        n_batches = 0
        try:
            with torch.no_grad():
                for batch in loader:
                    input_ids, attn_masks, labels = self._to_device(batch)
                    outputs = self.model(input_ids=input_ids, attention_mask=attn_masks)[0]

                    # .item() keeps the running statistics as plain Python
                    # numbers instead of device tensors.
                    loss_sum += self.criterion(outputs, labels).item()
                    # Softmax is monotonic, so argmax over the logits gives
                    # the same predictions as argmax over the probabilities.
                    preds = torch.argmax(outputs, dim=1)
                    correct_count += (preds == labels).sum().item()
                    total_count += len(labels)
                    n_batches += 1
        finally:
            self.model.train(was_training)

        if n_batches == 0 or total_count == 0:
            return float('nan'), float('nan')

        return loss_sum / n_batches, correct_count / total_count

    def train_loop(self, train_loader, val_loader=None):
        """Train for ``self.n_epochs`` epochs, printing progress as it goes.

        Args:
            train_loader: Iterable of training batches.
            val_loader: Optional validation loader. When given, validation
                accuracy and loss are reported before training starts and
                after every epoch.
        """
        print('Starting training loop\n\n')
        N_MINI_BATCH_CHECK = 200  # report the running loss every this many batches

        if val_loader is not None:
            print('Initial evaluating on validation dataset')
            val_loss, val_acc = self.measure_performance(val_loader)
            epoch_summary = f'[Epoch 0] | Val acc: {val_acc:.4f} Val loss: {val_loss:.4f}\n\n'
            print(epoch_summary)

        for epoch in range(self.n_epochs):
            print(f'--- Epoch: {epoch} ---')
            # Make sure dropout etc. are in training mode; from_pretrained
            # models start out in eval mode.
            self.model.train()
            epoch_start_time = time.time()
            batch_start_time = time.time()
            running_loss = 0.0

            for i, data in enumerate(train_loader):
                input_ids, attn_masks, labels = self._to_device(data)

                self.optimizer.zero_grad()

                # Evaluation/optimization step
                outputs = self.model(input_ids=input_ids, attention_mask=attn_masks)[0]
                loss = self.criterion(outputs, labels)
                loss.backward()
                self.optimizer.step()
                if self.scheduler is not None:
                    self.scheduler.step()

                # Print statistics periodically
                running_loss += loss.item()
                if i % N_MINI_BATCH_CHECK == N_MINI_BATCH_CHECK - 1:
                    batch_end_time = time.time()
                    total_batch_time = batch_end_time - batch_start_time

                    print(
                        f'[E{epoch + 1:d} B{i + 1:d}] ',
                        f'Loss: {running_loss / N_MINI_BATCH_CHECK:.5f} ',
                        f'Time: {total_batch_time:.2f} ',
                        f'LR: {self.scheduler.get_last_lr()}' if self.scheduler is not None else '')

                    # Reset statistics
                    batch_start_time = time.time()
                    running_loss = 0.0

            epoch_end_time = time.time()
            total_epoch_time = epoch_end_time - epoch_start_time
            epoch_summary = '[Epoch {}] {} seconds'.format((epoch + 1), total_epoch_time)

            if val_loader is not None:
                val_loss, val_acc = self.measure_performance(val_loader)
                # Report the validation loss, not the last training-batch loss.
                epoch_summary += f' | Val acc: {val_acc:.4f} | Val loss: {val_loss:.4f}'

            print(epoch_summary)

        print('Finished training')

    def lr_test(self, train_loader, lrs=(-9, 2)):
        """Run a learning-rate range test on a throw-away copy of the model.

        One SGD step is taken per batch while the learning rate sweeps
        log-uniformly from ``10 ** lrs[0]`` to ``10 ** lrs[1]`` (endpoints
        included), one value per batch. ``self.model`` itself is not updated.

        Args:
            train_loader: Sized iterable of training batches; its length
                determines the number of learning rates tried.
            lrs: ``(min_exponent, max_exponent)`` base-10 exponents.

        Returns:
            list[float]: The training loss recorded at each step.
        """
        # Prepare LR-finder loop
        model = copy.deepcopy(self.model).to(self.device)
        model.train()
        min_lr, max_lr = lrs
        schedule = torch.logspace(
            min_lr, max_lr, steps=len(train_loader), dtype=torch.float64
        ).tolist()
        if not schedule:
            return []

        # A single optimizer whose lr is overwritten each step; plain SGD
        # keeps no state, so this matches building a fresh one per step.
        optimizer = torch.optim.SGD(model.parameters(), lr=schedule[0])
        losses = []
        for i, data in enumerate(train_loader):
            curr_lr = schedule[i]
            for group in optimizer.param_groups:
                group['lr'] = curr_lr

            input_ids, attn_masks, labels = self._to_device(data)

            # Evaluation/optimization step
            optimizer.zero_grad()  # otherwise gradients accumulate across steps
            outputs = model(input_ids=input_ids, attention_mask=attn_masks)[0]
            loss = self.criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # Store a float so the autograd graph of every step can be freed.
            loss_value = loss.item()
            losses.append(loss_value)

            if i % 100 == 0:
                print(f'Step [{i}, {len(train_loader)}] | LR: {curr_lr:.4e} | Loss: {loss_value:.4f}')

        return losses

In [18]:
# Ten epochs with the SGD optimizer that carries the per-prefix learning rates.
trainer = Trainer(model=model, n_epochs=10, optimizer=optimizer)

In [1]:
# NOTE(review): the NameError recorded below, together with the execution count
# of 1, suggests this cell was run in a fresh kernel before the cells that
# define `trainer` -- run all cells above first.
trainer.train_loop(train_loader, val_loader)

NameError: name 'trainer' is not defined