## Validate GPU is available for use

In [1]:
!nvidia-smi

Tue Dec 15 01:26:07 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 418.87.01    Driver Version: 418.87.01    CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

In [2]:
import torch
torch.cuda.is_available()

True

## Install necessary packages

In [3]:
#!pip install -U adapter-transformers

In [4]:
#!pip install datasets

## Load and inspect data

In [5]:
import datasets
import torch
from torch.utils.data import DataLoader, TensorDataset


def get_dataset(dataset):
    ds = datasets.load_dataset('glue', dataset)
    num_classes = ds['train'].features['label'].num_classes
    return ds, num_classes


def create_dataset_from_text_dataset(ds, tokenizer):
    encoding = tokenizer(ds['sentence'], return_tensors='pt', padding=True, truncation=True)
    input_ids = encoding['input_ids']
    attn_masks = encoding['attention_mask']
    labels = torch.tensor(ds['label'])
    return TensorDataset(input_ids, attn_masks, labels)


def get_tensor_datasets(dataset_dict, splits, tokenizer):
    split_datasets = {}
    for s in splits:
        split_datasets[s] = create_dataset_from_text_dataset(dataset_dict[s], tokenizer)
    return split_datasets


def get_data_loaders(split_datasets, batch_size):
    train_loader = DataLoader(split_datasets['train'], batch_size, shuffle=True)
    val_loader = DataLoader(split_datasets['validation'], batch_size, shuffle=False)
    test_loader = DataLoader(split_datasets['test'], batch_size, shuffle=False)
    return train_loader, val_loader, test_loader

In [6]:
# Load dataset
dataset = 'sst2'
print(f'Loading {dataset} dataset...')
dataset_dict, num_classes = get_dataset(dataset)

Loading sst2 dataset...


Reusing dataset glue (/home/jupyter/.cache/huggingface/datasets/glue/sst2/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)


In [7]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

## Load Tokenizer

In [8]:
import torch
import transformers
from transformers import AdapterType
from transformers import BertTokenizerFast, BertForSequenceClassification


def get_tokenizer(model_name):
    if model_name == 'bert-base-uncased':
        tokenizer = BertTokenizerFast.from_pretrained(model_name)
    else:
        raise NotImplementedError

    return tokenizer


def get_transformer(model_name, num_labels, adapter, dataset):
    if model_name == 'bert-base-uncased':
        model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
        if adapter:
            model.add_adapter(dataset, AdapterType.text_task)
            model.train_adapter(dataset)
    else:
        raise NotImplementedError

    return model


def get_criterion(num_labels):
    if num_labels == 2:
        criterion = torch.nn.CrossEntropyLoss()
    else:
        raise NotImplementedError

    return criterion

In [9]:
# Load tokenizer
model_name = 'bert-base-uncased'
print(f'Loading tokenizer for {model_name}...')
tokenizer = get_tokenizer(model_name)

Loading tokenizer for bert-base-uncased...


## Create data loader for various splits

In [10]:
# Create data loader for each split
splits = list(dataset_dict.keys())
print(f'Creating data loader for {splits} splits...')
split_datasets = get_tensor_datasets(dataset_dict, splits, tokenizer)
train_loader, val_loader, test_loader = get_data_loaders(split_datasets, batch_size=32)

Creating data loader for ['train', 'validation', 'test'] splits...


In [11]:
# for i_batch, sample_batched in enumerate(train_loader):
#     print(i_batch, sample_batched[2].size())
    
#     if i_batch == 0:
#         print(sample_batched[2])
#         sb = sample_batched[2].to('cuda')
#         print(sb)
#         break

In [12]:
# Validate data loader
# sample_loader = DataLoader(split_datasets['train'], batch_size=3, shuffle=True)
# for i in sample_loader:
#     input_ids, attn_masks, labels = i
#     decoded = tokenizer.batch_decode(input_ids)
#     for d in decoded:
#         print(d)
#     break

## Create model

In [13]:
# Load model
adapter = True
print(f'Loading {model_name} with adapters={adapter}...')
model = get_transformer(model_name,
                        num_labels=num_classes,
                        adapter=adapter,
                        dataset=dataset)
criterion = get_criterion(num_labels=num_classes)

Loading bert-base-uncased with adapters=True...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

## Get Learning Scheme

In [14]:
import torch


def get_learning_scheme(learning_scheme, model, learning_rate, adapter, epoch):
    if learning_scheme == 'differential':
        optimizer_grouped_parameters = differential_learning_scheme(model, learning_rate)
        optimizer = torch.optim.SGD(optimizer_grouped_parameters)
    elif learning_scheme == 'fixed':
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    elif learning_scheme == 'nesterov':
        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9, nesterov=True)
    elif learning_scheme == 'gradual-unfreeze':
        optimizer_grouped_parameters = gradual_unfreezing_learning_scheme(model, learning_rate, adapter, epoch)
        optimizer = torch.optim.SGD(optimizer_grouped_parameters)
    else:
        raise NotImplementedError

    return optimizer


def differential_learning_scheme(model, learning_rate=0.1, divisor=2.6):
    param_prefixes = {}
    for n, p in model.named_parameters():
        if p.requires_grad:
            base = n.partition('.weight')[0].partition('.bias')[0]
            if base not in param_prefixes:
                param_prefixes[base] = 0

    param_prefix_divisors = list(reversed([divisor * i for i in range(1, len(param_prefixes))])) + [1]
    param_learning_rates = [learning_rate / ld for ld in param_prefix_divisors]

    param_prefix_lr_lookup = dict(zip(param_prefixes.keys(), param_learning_rates))

    optimizer_grouped_parameters = [
        {'params': p, 'lr': param_prefix_lr_lookup[n.partition('.weight')[0].partition('.bias')[0]]}
        for n, p in model.named_parameters() if p.requires_grad
    ]

    return optimizer_grouped_parameters


def gradual_unfreezing_learning_scheme(model, learning_rate, adapter, epoch=1):
    trainable_layers = []
    for n, p in model.named_parameters():
        if p.requires_grad:
            base = n.partition('.weight')[0].partition('.bias')[0]
            if adapter:
                if base not in trainable_layers and 'adapter' or 'classifier' in base:
                    trainable_layers.append(base)
            else:
                if base not in trainable_layers:
                    trainable_layers.append(base)

    optimizer_grouped_parameters = [
        {'params': p, 'lr': learning_rate}
        for n, p in model.named_parameters() if p.requires_grad and n.partition('.weight')[0].partition('.bias')[0] in trainable_layers[-epoch:]
    ]

    return optimizer_grouped_parameters


def get_scheduler(scheduler, optimizer, learning_rate, max_lr):
    if scheduler:
        if scheduler == 'cyclic-triangular':
            scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer,
                                                          base_lr=learning_rate,
                                                          max_lr=max_lr,
                                                          mode='triangular')
        else:
            raise NotImplementedError

    return scheduler

In [15]:
# Get learning scheme
learning_scheme = 'differential'
print(f'Configuring {learning_scheme} learning scheme...')
optimizer = get_learning_scheme(learning_scheme, model, learning_rate=0.01, adapter=adapter, epoch=0)

Configuring differential learning scheme...


## Train

In [16]:
import sys
sys.path.append('/home/jupyter/coms6998-project/trainer')

In [17]:
import pandas as pd
import time
import torch
import torch.nn.functional as F
from utils.learning_scheme import get_learning_scheme


class Trainer:
    def __init__(self, model, n_epochs, optimizer, scheduler, criterion, learning_scheme, learning_rate, adapter):
        self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        self.model = model.to(self.device)
        self.n_epochs = n_epochs
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.criterion = criterion.to(self.device)
        self.learning_scheme = learning_scheme
        self.learning_rate = learning_rate
        self.adapter = adapter

    def measure_performance(self, loader):
        running_loss = 0.0
        correct_count = 0.0
        total_count = 0.0
        for data in loader:
            input_ids = data[0].to(self.device)
            attn_masks = data[1].to(self.device)
            labels = data[2].to(self.device)
            with torch.no_grad():
                outputs = self.model(input_ids=input_ids, attention_mask=attn_masks)[0]
                loss = self.criterion(outputs, labels)
                probas = F.softmax(outputs, dim=1)
                preds = torch.argmax(probas, axis=1)

                # Track stats
                running_loss += loss
                correct_count += torch.sum(preds == labels)
                total_count += len(labels)

        running_loss /= len(loader)
        acc = correct_count / total_count

        return running_loss, acc

    def train_loop(self, train_loader, val_loader, batch_logging=10):
        print('Starting training loop')

        print('Initial evaluating on validation dataset')
        train_loss, train_acc = self.measure_performance(train_loader)
        val_loss, val_acc = self.measure_performance(val_loader)
        epoch_summary = f'[Epoch 0] | Train acc: {train_acc:.4f} Train loss: {train_loss:.4f} Val acc: {val_acc:.4f} Val loss: {val_loss:.4f}'
        print(epoch_summary)

        epoch_history = [{'epoch': 0,
                          'train loss': train_loss.item(),
                          'train accuracy': train_acc.item(),
                          'validation loss': val_loss.item(),
                          'validation accuracy': val_acc.item(),
                          'epoch time': 0}]
        batch_history = [{'epoch': 0,
                          'batch': 0,
                          'train loss': train_loss.item(),
                          'train accuracy': train_acc.item(),
                          'validation loss': val_loss.item(),
                          'validation accuracy': val_acc.item(),
                          'batch time': 0}]

        for epoch in range(self.n_epochs):

            if self.learning_scheme == 'gradual-unfreeze':
                self.optimizer = get_learning_scheme(self.learning_scheme,
                                                     self.model,
                                                     self.learning_rate,
                                                     self.adapter,
                                                     epoch+1)

            print(f'--- Epoch: {epoch+1} ---')
            epoch_start_time = time.time()
            batch_start_time = time.time()
            running_loss = 0.0
            running_acc = 0.0
            total_count = 0.0

            for i, data in enumerate(train_loader):
                input_ids = data[0].to(self.device)
                attn_masks = data[1].to(self.device)
                labels = data[2].to(self.device)

                self.optimizer.zero_grad()

                # Evaluation/optimization step
                outputs = self.model(input_ids=input_ids, attention_mask=attn_masks)[0]
                loss = self.criterion(outputs, labels)
                loss.backward()
                self.optimizer.step()
                if self.scheduler:
                    self.scheduler.step()

                running_loss += loss.item()
                probas = F.softmax(outputs, dim=1)
                preds = torch.argmax(probas, axis=1)
                running_acc += torch.sum(preds == labels).item()
                total_count += len(labels)

                # Print/log statistics periodically
                if i % batch_logging == batch_logging - 1:
                    batch_end_time = time.time()
                    total_batch_time = batch_end_time - batch_start_time
                    batch_loss = running_loss / batch_logging
                    batch_acc = running_acc / total_count
                    batch_val_loss, batch_val_acc = self.measure_performance(val_loader)

                    batch_history.append({'epoch': epoch+1,
                                          'batch': i + 1,
                                          'train loss': batch_loss,
                                          'train accuracy': batch_acc,
                                          'validation loss': batch_val_loss.item(),
                                          'validation accuracy': batch_val_acc.item(),
                                          'batch time': total_batch_time})

                    print(
                        f'[E{epoch + 1:d} B{i + 1:d}] ',
                        f'Loss: {batch_loss:.5f} ',
                        f'Acc: {batch_acc} ',
                        f'Time: {total_batch_time:.2f} ',
                        f'LR: {self.scheduler.get_last_lr()}' if self.scheduler else '')

                    # Reset statistics
                    batch_start_time = time.time()
                    running_loss = 0.0
                    running_acc = 0.0
                    total_count = 0.0

            epoch_end_time = time.time()
            total_epoch_time = epoch_end_time - epoch_start_time
            train_loss, train_acc = self.measure_performance(train_loader)
            val_loss, val_acc = self.measure_performance(val_loader)
            epoch_summary = f'[Epoch {epoch + 1}] {total_epoch_time:.2f} seconds'
            epoch_summary += f' | Train acc: {train_acc:.4f} Train loss: {train_loss:.4f} Val acc: {val_acc:.4f} Val loss: {val_loss:.4f}'

            epoch_history.append({'epoch': epoch + 1,
                                  'train loss': train_loss.item(),
                                  'train accuracy': train_acc.item(),
                                  'validation loss': val_loss.item(),
                                  'validation accuracy': val_acc.item(),
                                  'epoch time': total_epoch_time})

            print(epoch_summary)

        print('Finished training')

        return pd.DataFrame(epoch_history), pd.DataFrame(batch_history)

In [18]:
trainer = Trainer(model=model,
                  n_epochs=5,
                  optimizer=optimizer,
                  scheduler=None,
                  criterion=criterion,
                  learning_scheme=learning_scheme,
                  learning_rate=0.01,
                  adapter=adapter)

In [19]:
sample_loader = DataLoader(split_datasets['train'], batch_size=5, shuffle=False)
for i in sample_loader:
    input_ids, attn_masks, labels = i
    decoded = tokenizer.batch_decode(input_ids)
    for j, d in enumerate(decoded):
        print(f'--- Sample {j+1} ---')
        print(d)
    break

--- Sample 1 ---
[CLS] hide new secretions from the parental units [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
--- Sample 2 ---
[CLS] contains no wit, only labored gags [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
--- Sample 3 ---
[CLS] that loves its characters and communicates something rather beautiful about human nature [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [

In [None]:
trainer.train_loop(train_loader, val_loader, batch_logging=100)

Starting training loop
Initial evaluating on validation dataset
[Epoch 0] | Train acc: 0.5577 Train loss: 0.6943 Val acc: 0.5092 Val loss: 0.7242
--- Epoch: 1 ---
[E1 B100]  Loss: 0.69384  Acc: 0.5421875  Time: 13.78  
[E1 B200]  Loss: 0.68137  Acc: 0.5746875  Time: 13.78  
[E1 B300]  Loss: 0.66610  Acc: 0.6025  Time: 13.79  
[E1 B400]  Loss: 0.64029  Acc: 0.6303125  Time: 13.78  
[E1 B500]  Loss: 0.63395  Acc: 0.648125  Time: 13.77  
[E1 B600]  Loss: 0.62413  Acc: 0.6459375  Time: 13.78  
[E1 B700]  Loss: 0.60315  Acc: 0.6796875  Time: 13.77  
[E1 B800]  Loss: 0.60733  Acc: 0.6825  Time: 13.78  
[E1 B900]  Loss: 0.59183  Acc: 0.7009375  Time: 13.78  
[E1 B1000]  Loss: 0.57802  Acc: 0.71  Time: 13.77  
[E1 B1100]  Loss: 0.58295  Acc: 0.700625  Time: 13.77  
[E1 B1200]  Loss: 0.56639  Acc: 0.7265625  Time: 13.77  
[E1 B1300]  Loss: 0.55560  Acc: 0.74125  Time: 13.77  
[E1 B1400]  Loss: 0.55803  Acc: 0.7303125  Time: 13.77  
[E1 B1500]  Loss: 0.55543  Acc: 0.7265625  Time: 13.76  
[E1 B1

In [25]:
sample_loader = DataLoader(split_datasets['train'], batch_size=5, shuffle=True)
for i in sample_loader:
    input_ids, attn_masks, labels = i
    decoded = tokenizer.batch_decode(input_ids)
    for j, d in enumerate(decoded):
        print(f'--- Sample {j+1} ---')
        print(d)
    print(labels)
    input_ids = input_ids.to(trainer.device)
    attn_masks = attn_masks.to(trainer.device)
    labels = labels.to(trainer.device)
    outputs = trainer.model(input_ids=input_ids, attention_mask=attn_masks)[0]
    probas = F.softmax(outputs, dim=1)
    preds = torch.argmax(probas, axis=1)
    print(preds)
    break

--- Sample 1 ---
[CLS] i found myself growing more and more frustrated and detached as vincent became more and more abhorrent. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
--- Sample 2 ---
[CLS] can get your money back [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
--- Sample 3 ---
[CLS] to the climactic burst of violence [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 