In [1]:
import torch
torch.cuda.is_available()

True

In [0]:
!wget -O "SST-2.zip" "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8"
!unzip "SST-2.zip"
# заменить username и password на свои 
!git clone https://username:password@github.com/yandexdataschool/lilbert.git
    
!pip install -r lilbert/requirements.txt
!mkdir lilbert/output

In [3]:
import sys
%load_ext autoreload
%autoreload 2
sys.path.append('lilbert/lilbert')

import numpy as np
import random
import torch

from pytorch_pretrained_bert.tokenization import BertTokenizer

from lib import data_processors, tasks
from lib.bert import BertForSequenceClassification
from lib.train_eval import train, evaluate, predict

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [4]:
params = {
    'data_dir': 'SST-2',
    'output_dir': '../output',
    'cache_dir': '../model_cache',
    'task_name': 'sst2',
    'bert_model': 'bert-base-uncased',
    'max_seq_length': 128,
    'train_batch_size': 32,
    'eval_batch_size': 8,
    'learning_rate': 2e-5,
    'warmup_proportion': 0.1,
    'num_train_epochs': 1,
    'seed': 1331,
    'device': torch.device(
        'cuda' if torch.cuda.is_available()
        else 'cpu')
}

random.seed(params['seed'])
np.random.seed(params['seed'])
torch.manual_seed(params['seed'])

<torch._C.Generator at 0x7fa1fba49390>

In [0]:
params['num_labels'] = tasks.num_labels[params['task_name']]
params['label_list'] = tasks.label_lists[params['task_name']]

processor = tasks.processors[params['task_name']]()
tokenizer = BertTokenizer.from_pretrained(
    params['bert_model'], do_lower_case=True)

train_examples = processor.get_train_examples(params['data_dir'])
dev_examples = processor.get_dev_examples(params['data_dir'])
model = BertForSequenceClassification.from_pretrained(
    params['bert_model'],
    cache_dir=params['cache_dir'],
    num_labels=params['num_labels']).to(params['device'])

## SVDLinear

In [0]:
import numpy as np
import math

from torch import nn
from torch.nn import init
from torch.nn import functional as F
from torch.nn.parameter import Parameter

In [0]:
class SVDLinear(nn.Module):
    def __init__(self, in_features, out_features, hidden_size, bias=True):
        super(SVDLinear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.hidden_size = hidden_size

        self.u = Parameter(torch.Tensor(out_features, self.hidden_size))
        self.s = Parameter(torch.Tensor(self.hidden_size))
        self.v = Parameter(torch.Tensor(self.hidden_size, in_features))

        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)

        self.init_parameters()

    def init_weights(self, weight=None):
        if weight is None:
            init.kaiming_uniform_(self.u, a=math.sqrt(5))
            init.uniform_(self.s, a=math.sqrt(5))
            init.kaiming_uniform_(self.v, a=math.sqrt(5))
        else:
            u, s, v = np.linalg.svd(weight)
            del self.u, self.s, self.v
            self.u = Parameter(torch.Tensor(u[:, :self.hidden_size]))
            self.s = Parameter(torch.Tensor(s[:self.hidden_size]))
            self.v = Parameter(torch.Tensor(v[:self.hidden_size, :]))

    def init_parameters(self, weight=None):
        self.init_weights(weight)

        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(
                torch.mm(self.u, torch.mm(torch.diag(self.s), self.v))
            )
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)

    def forward(self, input):
        output = F.linear(input, self.v)
        output = F.linear(output, torch.diag(self.s))
        output = F.linear(output, self.u, self.bias)
        return output

In [0]:
def find_hidden_size(in_features, out_features, compression_rate):
    """
    Returns hidden size of matrices for known compression rate
    """
    return int(in_features * out_features /
               (compression_rate * (in_features + out_features + 1)))


def find_compression_rate(in_features, out_features, hidden_size):
    """
    Returns matrix compression rate for known hidden size
    """
    return in_features * out_features / \
           (hidden_size * (in_features + out_features + 1))


def linear_to_svd(linear_layer, hidden_size=None, compression_rate=None):
    """
    Returns SVDLinear layer for linear layer with hidden size
    equal to hidden_size parameter
    if hidden_size is None hidden_size is calculated by the compression_rate
    """
    if hidden_size is None and compression_rate is None:
        raise ValueError("At least one parameter (hidden_size or compression rate) should be not None")

    dense_weight = linear_layer.weight.cpu().data.numpy()
    in_features = linear_layer.in_features
    out_features = linear_layer.out_features

    if hidden_size is None:
        hidden_size = find_hidden_size(
            in_features, out_features, compression_rate
        )

    svd_linear = SVDLinear(in_features, out_features, hidden_size)
    svd_linear.init_weights(dense_weight)

    return svd_linear

## Example of changing all encoder linear layers

In [0]:
from tqdm import tqdm

In [0]:
def change_transformer_linears_to_svd(model, params, hidden_size=None, compression_rate=None):
    """
    Changes all linear layers to SVDLinear layers with hidden size
    equal to hidden_size parameter
    if hidden_size is None hidden_size for each layer is calculated by the compression_rate
    """
    if hidden_size is None and compression_rate is None:
        raise ValueError("At least one parameter (hidden_size or compression rate) should be not None")

    encoder_layers = list(model.children())[0].encoder.layer
    device = params['device']

    for bert_layer in tqdm(encoder_layers):
        attention = bert_layer.attention

        attention.self.query = linear_to_svd(
            attention.self.query,
            hidden_size,
            compression_rate
        ).to(device)

        attention.self.key = linear_to_svd(
            attention.self.key,
            hidden_size,
            compression_rate
        ).to(device)

        attention.self.value = linear_to_svd(
            attention.self.value,
            hidden_size,
            compression_rate
        ).to(device)

        attention.output.dense = linear_to_svd(
            attention.output.dense,
            hidden_size,
            compression_rate
        ).to(device)

        bert_layer.intermediate.dense = linear_to_svd(
            bert_layer.intermediate.dense,
            hidden_size,
            compression_rate
        ).to(device)

        bert_layer.output.dense = linear_to_svd(
            bert_layer.output.dense,
            hidden_size,
            compression_rate
        ).to(device)
    return model

In [0]:
from lib import size_utils

In [0]:
!wget -O "model.pt" "https://www.dropbox.com/s/2gclpuhipfovph2/model_baseline_from_parts.pt?dl=0"
model.load_state_dict(torch.load("model.pt"))

In [30]:
model_size = size_utils.get_model_size(model, params['cache_dir'])
print("Model initial size: {}".format(model_size))

Model initial size: 417.6935796737671 MB


In [31]:
model = change_transformer_linears_to_svd(model, params, compression_rate=2)

100%|██████████| 12/12 [01:36<00:00,  8.13s/it]


In [32]:
changed_model_size = size_utils.get_model_size(model, params['cache_dir'])
print("Size of model with svd linear layers: {}".format(changed_model_size))

Size of model with svd linear layers: 255.42891216278076 MB


In [0]:
result, prob_preds = evaluate(model, tokenizer, params,
                              dev_examples)
result

***** Running evaluation *****
Num examples:  872
Batch size:    8


Evaluating: 100%|██████████| 109/109 [00:12<00:00,  8.94it/s]


{'eval_accuracy': 0.805045871559633,
 'eval_f1_score': 0.8004694835680752,
 'eval_loss': 0.4252772675496056,
 'eval_matthews_corrcoef': 0.6126376550212791}

In [0]:
EPOCH_NUM = 1

params['num_train_epochs'] = 1
checkpoint_files = {
    'config': 'bert_config.json',
    'file_to_save': 'model_{}_epoch_{}.pth'.format(
        params['task_name'], EPOCH_NUM),
}

model, result = train(model, tokenizer, params,
                      train_examples,
                      valid_examples=dev2_examples,
                      checkpoint_files=checkpoint_files)

result

***** Running training *****
Num examples: 67349
Batch size:   32
Num steps:    2104


Iteration:   0%|          | 0/2105 [00:00<?, ?it/s]


Epoch: 1


Iteration: 100%|██████████| 2105/2105 [45:43<00:00,  1.20s/it]


{'train_loss': 0.1975332941656441, 'train_global_step': 2105}
***** Running evaluation *****
Num examples:  872
Batch size:    8


Evaluating: 100%|██████████| 109/109 [00:12<00:00,  9.04it/s]


{'eval_loss': 0.2370125255110239, 'eval_accuracy': 0.9162844036697247, 'eval_f1_score': 0.9180695847362514, 'eval_matthews_corrcoef': 0.8325109894262972}


{'train_global_step': 2105, 'train_loss': 0.1975332941656441}