# Test GPU capabilities and PyTorch version alignment

In [1]:
# Shell escape: print the driver/CUDA versions and the GPU attached to this runtime.
!nvidia-smi

Fri Dec 11 03:53:41 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.45.01    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   62C    P8    11W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
import torch
# Report the installed PyTorch build, then whether it can see a CUDA device
# (one value per line).
print(torch.__version__, torch.cuda.is_available(), sep='\n')

1.7.0+cu101
True


# Install necessary packages

In [3]:
!pip install -U adapter-transformers

Collecting adapter-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/9d/44/1370c187aba1349d56d6813ec4de54644d15e154983050f4923ce5455069/adapter_transformers-1.1.0-py3-none-any.whl (1.3MB)
[K     |████████████████████████████████| 1.3MB 9.2MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 25.0MB/s 
Collecting sentencepiece==0.1.91
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 41.0MB/s 
Collecting tokenizers==0.9.3
[?25l  Downloading https://files.pythonhosted.org/packages/4c/34/b39eb9994bc3c999270b69c9eea40ecc6f0e97991dba28282b9fd32d44ee/tokenizers-0.9.3-cp36-cp36m-manylinux1_x86_64.whl (2.

In [4]:
!pip install datasets

Collecting datasets
[?25l  Downloading https://files.pythonhosted.org/packages/1a/38/0c24dce24767386123d528d27109024220db0e7a04467b658d587695241a/datasets-1.1.3-py3-none-any.whl (153kB)
[K     |██▏                             | 10kB 25.3MB/s eta 0:00:01[K     |████▎                           | 20kB 13.6MB/s eta 0:00:01[K     |██████▍                         | 30kB 12.1MB/s eta 0:00:01[K     |████████▌                       | 40kB 11.5MB/s eta 0:00:01[K     |██████████▋                     | 51kB 8.2MB/s eta 0:00:01[K     |████████████▉                   | 61kB 7.5MB/s eta 0:00:01[K     |███████████████                 | 71kB 8.4MB/s eta 0:00:01[K     |█████████████████               | 81kB 9.3MB/s eta 0:00:01[K     |███████████████████▏            | 92kB 9.8MB/s eta 0:00:01[K     |█████████████████████▎          | 102kB 7.9MB/s eta 0:00:01[K     |███████████████████████▌        | 112kB 7.9MB/s eta 0:00:01[K     |█████████████████████████▋      | 122kB 7.9MB/s e

# Load and inspect dataset

In [5]:
import datasets
import torch
from torch.utils.data import DataLoader, TensorDataset

def get_dataset(dataset):
    """Load one GLUE configuration and report its number of label classes.

    Args:
        dataset: Configuration name handed to ``datasets.load_dataset`` as the
            second argument after ``'glue'`` (the notebook passes ``'sst2'``).

    Returns:
        A ``(loaded, num_classes)`` tuple: whatever ``load_dataset`` returned,
        and the ``num_classes`` attribute of the train split's ``label``
        feature.
    """
    glue_splits = datasets.load_dataset('glue', dataset)
    label_feature = glue_splits['train'].features['label']
    return glue_splits, label_feature.num_classes


def create_dataset_from_text_dataset(ds, tokenizer):
    """Tokenize one split and pack it into a ``TensorDataset``.

    Args:
        ds: Mapping-like split exposing a ``'sentence'`` column and a
            ``'label'`` column.
        tokenizer: Callable invoked on the whole ``'sentence'`` column with
            ``return_tensors='pt'``, ``padding=True`` and ``truncation=True``;
            its result must provide ``'input_ids'`` and ``'attention_mask'``.

    Returns:
        ``TensorDataset(input_ids, attention_mask, labels)``, in that order.
    """
    encoded = tokenizer(ds['sentence'], return_tensors='pt', padding=True, truncation=True)
    columns = (
        encoded['input_ids'],
        encoded['attention_mask'],
        torch.tensor(ds['label']),
    )
    return TensorDataset(*columns)


def get_tensor_datasets(dataset_dict, splits, tokenizer):
    """Convert each requested split into a tensor dataset.

    Args:
        dataset_dict: Mapping from split name to the raw split.
        splits: Iterable of split names to convert.
        tokenizer: Passed through to ``create_dataset_from_text_dataset``.

    Returns:
        Dict mapping every name in ``splits`` to the result of
        ``create_dataset_from_text_dataset`` for that split.
    """
    return {
        split_name: create_dataset_from_text_dataset(dataset_dict[split_name], tokenizer)
        for split_name in splits
    }


def get_data_loaders(split_datasets, batch_size=256, num_workers=0):
    """Wrap the train and validation datasets in ``DataLoader`` objects.

    Args:
        split_datasets: Mapping with at least ``'train'`` and ``'validation'``
            entries; any other entries are ignored.
        batch_size: Samples per batch for both loaders. Defaults to 256, the
            value that used to be hard-coded.
        num_workers: Worker processes for both loaders. Defaults to 0
            (load in the main process), as before.

    Returns:
        ``(train_loader, val_loader)``. The training loader reshuffles every
        epoch; the validation loader keeps dataset order.

    Raises:
        KeyError: If ``'train'`` or ``'validation'`` is missing.
    """
    train_loader = DataLoader(
        split_datasets['train'],
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
    )
    val_loader = DataLoader(
        split_datasets['validation'],
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
    )
    return train_loader, val_loader

In [6]:
# GLUE configuration to train on; the same string is later passed as `dataset=` to get_transformer.
dataset = 'sst2'
print(f'Loading {dataset} dataset...')
# get_dataset returns the loaded splits plus the class count of the train split's label feature.
dataset_dict, num_classes = get_dataset(dataset)

Loading sst2 dataset...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=7826.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=4473.0, style=ProgressStyle(description…


Downloading and preparing dataset glue/sst2 (download: 7.09 MiB, generated: 4.81 MiB, post-processed: Unknown size, total: 11.90 MiB) to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=7439277.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/sst2/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4. Subsequent calls will reuse this data.


In [7]:
# Display the loaded splits with their columns and row counts.
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

# Load Tokenizer

In [8]:
import torch
import transformers
from transformers import AdapterType
from transformers import BertTokenizerFast, BertForSequenceClassification


def get_tokenizer(model_name):
    """Return the pretrained fast tokenizer for a supported checkpoint.

    Args:
        model_name: Checkpoint identifier. Only ``'bert-base-uncased'`` is
            supported.

    Returns:
        ``BertTokenizerFast.from_pretrained(model_name)``.

    Raises:
        NotImplementedError: For any other ``model_name``.
    """
    # Guard first so the supported path reads straight through.
    if model_name != 'bert-base-uncased':
        raise NotImplementedError

    return BertTokenizerFast.from_pretrained(model_name)


def get_transformer(model_name, num_labels, adapter, dataset):
    """Build the sequence classifier, optionally with a task adapter attached.

    Args:
        model_name: Checkpoint identifier. Only ``'bert-base-uncased'`` is
            supported.
        num_labels: Forwarded to ``from_pretrained`` as ``num_labels``.
        adapter: When truthy, an adapter named ``dataset`` is added with
            ``AdapterType.text_task`` and then passed to ``train_adapter``.
        dataset: Name used for the adapter; unused when ``adapter`` is falsy.

    Returns:
        The ``BertForSequenceClassification`` instance.

    Raises:
        NotImplementedError: For any other ``model_name``.
    """
    if model_name != 'bert-base-uncased':
        raise NotImplementedError

    classifier = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    if not adapter:
        return classifier

    # Register the adapter under the dataset's name, then select it for training.
    classifier.add_adapter(dataset, AdapterType.text_task)
    classifier.train_adapter(dataset)
    return classifier


def get_criterion(num_labels):
    """Return the loss module for a classification task.

    Cross-entropy over logits is valid for any number of classes, so every
    integer class count of two or more is accepted (previously only exactly 2
    was, which rejected multi-class tasks for no reason).

    Args:
        num_labels: Number of target classes.

    Returns:
        A fresh ``torch.nn.CrossEntropyLoss`` instance.

    Raises:
        NotImplementedError: If ``num_labels`` is not a class count of at
            least two (e.g. a single-output regression task).
    """
    # `== 2` keeps accepting everything the old check accepted; the isinstance
    # branch extends it to larger integer class counts only.
    is_classification = num_labels == 2 or (isinstance(num_labels, int) and num_labels > 2)
    if not is_classification:
        raise NotImplementedError(
            f'No criterion configured for num_labels={num_labels!r}; '
            'expected a class count of at least 2.'
        )

    return torch.nn.CrossEntropyLoss()

In [9]:
# Checkpoint name; used here for the tokenizer and again later when the model is created.
model_name = 'bert-base-uncased'
print(f'Loading tokenizer for {model_name}...')
tokenizer = get_tokenizer(model_name)

Loading tokenizer for bert-base-uncased...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




# Create DataLoaders

In [10]:
# Tokenize every loaded split up front; each becomes a TensorDataset of
# (input_ids, attention_mask, label).
splits = list(dataset_dict.keys())
print(f'Creating data loader for {splits} splits...')
split_datasets = get_tensor_datasets(dataset_dict, splits, tokenizer)
# Only the 'train' and 'validation' entries get a loader; any other split
# stays in split_datasets without one.
train_loader, val_loader = get_data_loaders(split_datasets)

Creating data loader for ['train', 'validation', 'test'] splits...


# Create model

In [11]:
# With adapter=False, get_transformer skips add_adapter/train_adapter and
# returns the classifier as loaded from the checkpoint.
adapter = False
print(f'Loading {model_name} with adapters={adapter}...')
model = get_transformer(model_name,
                        num_labels=num_classes,
                        adapter=adapter,
                        dataset=dataset)
# Loss module matching the number of classes reported by get_dataset.
criterion = get_criterion(num_labels=num_classes)

Loading bert-base-uncased with adapters=False...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# Get Learning Scheme

In [12]:
import torch


def get_learning_scheme(learning_scheme, model):
    """Create the optimizer for the named learning scheme.

    Args:
        learning_scheme: Scheme name. Only ``'differential'`` is supported.
        model: Model whose parameters are handed to
            ``differential_learning_scheme``.

    Returns:
        A ``torch.optim.SGD`` built from the parameter groups (each group
        carries its own ``lr``).

    Raises:
        NotImplementedError: For any other ``learning_scheme``.
    """
    if learning_scheme != 'differential':
        raise NotImplementedError

    param_groups = differential_learning_scheme(model)
    return torch.optim.SGD(param_groups)


def differential_learning_scheme(model, learning_rate=0.1, divisor=2.6):
    """Build optimizer parameter groups with position-dependent learning rates.

    Trainable parameters are bucketed by name prefix: the name is cut at its
    first ``'.weight'`` and then at its first ``'.bias'``, so a module's weight
    and bias share a bucket. Buckets keep the order in which
    ``model.named_parameters()`` first yields them. The last bucket trains at
    ``learning_rate``; the bucket ``i`` places before it trains at
    ``learning_rate / (divisor * i)``, i.e. the rate shrinks linearly in ``i``.

    Args:
        model: Object exposing ``named_parameters()``; parameters with
            ``requires_grad`` false are left out entirely.
        learning_rate: Rate assigned to the last bucket.
        divisor: Per-step factor in the denominator for earlier buckets.

    Returns:
        A list with one ``{'params': parameter, 'lr': rate}`` dict per
        trainable parameter, in ``named_parameters()`` order.
    """
    def bucket_of(param_name):
        # Strip from the first '.weight', then from the first '.bias'.
        head, _, _ = param_name.partition('.weight')
        return head.partition('.bias')[0]

    trainable = [
        (param_name, param)
        for param_name, param in model.named_parameters()
        if param.requires_grad
    ]

    # dict.fromkeys drops repeats while preserving first-seen order.
    ordered_buckets = list(dict.fromkeys(bucket_of(param_name) for param_name, _ in trainable))

    last_index = len(ordered_buckets) - 1
    lr_by_bucket = {}
    for index, bucket in enumerate(ordered_buckets):
        steps_from_last = last_index - index
        # The final bucket is divided by 1, every earlier one by divisor * distance.
        denominator = divisor * steps_from_last if steps_from_last else 1
        lr_by_bucket[bucket] = learning_rate / denominator

    return [
        {'params': param, 'lr': lr_by_bucket[bucket_of(param_name)]}
        for param_name, param in trainable
    ]

In [13]:
# 'differential' is the only scheme get_learning_scheme implements; it returns
# an SGD optimizer whose parameter groups each carry their own learning rate.
learning_scheme = 'differential'
print(f'Configuring {learning_scheme} learning scheme...')
optimizer = get_learning_scheme(learning_scheme, model)

Configuring differential learning scheme...


# Train

In [14]:
import time
import torch
import torch.nn.functional as F

In [15]:
# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
model = model.to(device)
criterion = criterion.to(device)

# Training configuration. The optimizer created earlier is used as-is (the
# former `optimizer = optimizer` line was a no-op and has been dropped).
n_epochs = 2
scheduler = None  # no LR schedule; the training loop only steps it when set

Using device: cuda


In [16]:
# Number of mini-batches between progress lines printed by the training loop.
N_MINI_BATCH_CHECK = 10

In [17]:
def measure_performance(loader):
    """Evaluate the notebook-global ``model`` on every batch of ``loader``.

    Reads the globals ``model``, ``criterion`` and ``device``. The model is put
    in eval mode for the duration of the pass (so dropout does not perturb the
    metrics) and restored to its previous mode afterwards, even on error.

    Args:
        loader: Iterable of batches shaped ``(input_ids, attention_mask,
            labels)`` that also supports ``len()``.

    Returns:
        ``(mean_loss, accuracy)`` as Python floats. ``mean_loss`` is the
        average of the per-batch criterion values; ``accuracy`` is the
        fraction of correctly predicted samples.

    Raises:
        ZeroDivisionError: If ``loader`` yields no batches.
    """
    was_training = model.training
    model.eval()

    running_loss = 0.0
    correct_count = 0
    total_count = 0
    try:
        with torch.no_grad():
            for data in loader:
                input_ids = data[0].to(device)
                attn_masks = data[1].to(device)
                labels = data[2].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attn_masks)[0]

                # .item() keeps the accumulators as plain Python numbers
                # instead of tensors living on the compute device.
                running_loss += criterion(outputs, labels).item()
                # argmax over the logits picks the same class as argmax over
                # softmax, so the softmax is skipped.
                preds = outputs.argmax(dim=1)
                correct_count += (preds == labels).sum().item()
                total_count += len(labels)
    finally:
        # Hand the model back in whatever mode the caller had it in.
        model.train(was_training)

    running_loss /= len(loader)
    acc = correct_count / total_count

    return running_loss, acc

In [18]:
# Baseline: score the model on the validation set before any optimizer step,
# reported as "Epoch 0" so it lines up with the per-epoch summaries.
if val_loader:
    print('Initial evaluating on validation dataset')
    val_loss, val_acc = measure_performance(val_loader)
    epoch_summary = f'[Epoch 0] | Val acc: {val_acc:.4f} Val loss: {val_loss:.4f}'
    print(epoch_summary)

Initial evaluating on validation dataset
[Epoch 0] | Val acc: 0.4908 Val loss: 0.7676


In [19]:
for epoch in range(n_epochs):
    print(f'--- Epoch: {epoch} ---')
    # Hugging Face `from_pretrained` returns the model in eval mode, and the
    # validation pass at the end of each epoch switches back to eval as well,
    # so training mode (dropout active) must be requested at every epoch start.
    model.train()
    epoch_start_time = time.time()
    batch_start_time = time.time()
    running_loss = 0.0

    for i, data in enumerate(train_loader):
        # Each batch is (input_ids, attention_mask, labels).
        input_ids = data[0].to(device)
        attn_masks = data[1].to(device)
        labels = data[2].to(device)

        optimizer.zero_grad()

        # Evaluation/optimization step
        outputs = model(input_ids=input_ids, attention_mask=attn_masks)[0]
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        if scheduler:
            scheduler.step()

        # Print statistics periodically
        running_loss += loss.item()
        if i % N_MINI_BATCH_CHECK == N_MINI_BATCH_CHECK - 1:
            batch_end_time = time.time()
            total_batch_time = batch_end_time - batch_start_time

            # Loss is averaged over the last N_MINI_BATCH_CHECK batches; the
            # time covers the same window.
            print(
                f'[E{epoch + 1:d} B{i + 1:d}] ',
                f'Loss: {running_loss / N_MINI_BATCH_CHECK:.5f} ',
                f'Time: {total_batch_time:.2f} ',
                f'LR: {scheduler.get_last_lr()}' if scheduler else '')

            # Reset statistics
            batch_start_time = time.time()
            running_loss = 0.0

    epoch_end_time = time.time()
    total_epoch_time = epoch_end_time - epoch_start_time
    epoch_summary = '[Epoch {}] {} seconds'.format((epoch + 1), total_epoch_time)

    if val_loader:
        # Validate with dropout disabled; model.train() at the top of the loop
        # re-enables it for the next epoch.
        model.eval()
        val_loss, val_acc = measure_performance(val_loader)
        epoch_summary += f' | Val acc: {val_acc:.4f} | Val loss: {val_loss:.4f}'

    print(epoch_summary)

# Leave the model in eval mode so any later inference is deterministic.
model.eval()
print('Finished training')

--- Epoch: 0 ---
[E1 B10]  Loss: 3.44072  Time: 25.28  
[E1 B20]  Loss: 0.67735  Time: 26.73  
[E1 B30]  Loss: 0.68136  Time: 28.36  
[E1 B40]  Loss: 0.67121  Time: 27.43  
[E1 B50]  Loss: 0.64553  Time: 27.41  
[E1 B60]  Loss: 0.65224  Time: 27.81  
[E1 B70]  Loss: 0.61834  Time: 27.62  
[E1 B80]  Loss: 0.58055  Time: 27.55  
[E1 B90]  Loss: 0.57255  Time: 27.63  
[E1 B100]  Loss: 0.50849  Time: 27.60  
[E1 B110]  Loss: 0.43454  Time: 27.64  
[E1 B120]  Loss: 0.40968  Time: 27.72  
[E1 B130]  Loss: 0.35291  Time: 27.64  
[E1 B140]  Loss: 0.33933  Time: 27.66  
[E1 B150]  Loss: 0.35922  Time: 27.70  
[E1 B160]  Loss: 0.33588  Time: 27.62  
[E1 B170]  Loss: 0.33323  Time: 27.60  
[E1 B180]  Loss: 0.31815  Time: 27.65  
[E1 B190]  Loss: 0.31162  Time: 27.62  
[E1 B200]  Loss: 0.32274  Time: 27.60  
[E1 B210]  Loss: 0.31899  Time: 27.58  
[E1 B220]  Loss: 0.31092  Time: 27.57  
[E1 B230]  Loss: 0.29866  Time: 27.58  
[E1 B240]  Loss: 0.32494  Time: 27.58  
[E1 B250]  Loss: 0.29867  Time: 