In [1]:
import torch
torch.cuda.is_available()

True

In [2]:
!python -V

Python 3.6.7


In [3]:
!ls

sample_data


In [None]:
!wget -O "SST-2.zip" "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8"
!unzip "SST-2.zip"
# заменить username и password на свои 
!git clone https://username:password@github.com/yandexdataschool/lilbert.git
    
!pip install -r lilbert/requirements.txt
!mkdir ./lilbert/output

In [5]:
import sys
%load_ext autoreload
%autoreload 2
sys.path.append('lilbert/lilbert')

import numpy as np
import random
import torch
from tqdm import tqdm
from pytorch_pretrained_bert.tokenization import BertTokenizer

from lib import data_processors, tasks
from lib.bert import BertForSequenceClassification
from lib.train_eval import train, evaluate, predict

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [6]:
%env CUDA_VISIBLE_DEVICES=1

params = {
    'data_dir': 'SST-2',
    'output_dir': '../output',
    'cache_dir': '../model_cache',
    'task_name': 'sst2',
    'bert_model': 'bert-base-uncased',
    'max_seq_length': 128,
    'train_batch_size': 32,
    'eval_batch_size': 8,
    'learning_rate': 2e-5,
    'warmup_proportion': 0.1,
    'num_train_epochs': 1,
    'seed': 1331,
    'device': torch.device(
        'cuda' if torch.cuda.is_available()
        else 'cpu')
}

random.seed(params['seed'])
np.random.seed(params['seed'])
torch.manual_seed(params['seed'])

env: CUDA_VISIBLE_DEVICES=1


<torch._C.Generator at 0x7f48b8ad1f90>

In [0]:
device = torch.device(
        'cuda' if torch.cuda.is_available()
        else 'cpu')

In [8]:
params['num_labels'] = tasks.num_labels[params['task_name']]
params['label_list'] = tasks.label_lists[params['task_name']]

processor = tasks.processors[params['task_name']]()
tokenizer = BertTokenizer.from_pretrained(
    params['bert_model'], do_lower_case=True)

train_examples = processor.get_train_examples(params['data_dir'])
dev_examples = processor.get_dev_examples(params['data_dir'])
model = BertForSequenceClassification.from_pretrained(
    params['bert_model'],
    cache_dir=params['cache_dir'],
    num_labels=params['num_labels']).to(params['device'])

100%|██████████| 231508/231508 [00:00<00:00, 413241.10B/s]
100%|██████████| 407873900/407873900 [00:31<00:00, 12818917.85B/s]


In [None]:
!wget -O "model.pt" "https://www.dropbox.com/s/2gclpuhipfovph2/model_baseline_from_parts.pt?dl=0"

In [0]:
model.load_state_dict(torch.load("model.pt"))

In [11]:
result, prob_preds = evaluate(model, tokenizer, params,
                              dev_examples)
result


***** Running evaluation *****
Num examples:  872
Batch size:    8


Evaluating: 100%|██████████| 109/109 [00:14<00:00,  7.63it/s]


{'eval_accuracy': 0.9254587155963303,
 'eval_f1_score': 0.9276974416017797,
 'eval_loss': 0.19265737580421163,
 'eval_matthews_corrcoef': 0.851069468488703}

In [0]:
from sklearn.cluster import KMeans
class Clusterize():
    def __init__(self, matrix, num_clusters=10):
      # reshape matrix
        self.matrix_shape = matrix.shape
        self.matrix = matrix.reshape((-1,1))
        matrix_len = len(self.matrix)
        # clustering
        cluster_algo = KMeans(n_clusters=num_clusters)
        cluster_algo.fit(self.matrix)
        self.clusters_labels = cluster_algo.labels_
        self.centers = cluster_algo.cluster_centers_
        # sort by vars
        self.clusters_size = [sum(self.clusters_labels == n_cluster) for n_cluster in range(num_clusters)]
        self.cluster_EQ = [self.matrix[np.argwhere(self.clusters_labels == n_cluster), :].var() * self.clusters_size[n_cluster]
            for n_cluster in range(num_clusters)]
      
    def get_clustered_matrix(self, num_clusters_to_quant=1):

        to_quant = np.argsort(self.cluster_EQ)[-num_clusters_to_quant:]
        # quantize
        mask = np.ones_like(self.clusters_labels)
        for n_cluster in to_quant:
            self.matrix[np.argwhere(self.clusters_labels == n_cluster), :] = self.centers[n_cluster][0]
            mask[np.argwhere(self.clusters_labels == n_cluster)] = 0
        return self.matrix.reshape(self.matrix_shape), mask.reshape(self.matrix_shape)

### Embedding quantization

In [0]:
class QuantLayerByteEmb(torch.nn.Module):
    def __init__(self, quant_matrix):
        super(QuantLayerByteEmb, self).__init__()
        self.values = torch.unique(quant_matrix)
        indx = [torch.nonzero(quant_matrix == value) for value in self.values]
        vals = [torch.ones(ind.shape[0]).to(device) * value for ind, value in zip(indx, self.values)]
        self.spmatrix = torch.sparse.FloatTensor(torch.cat(indx).transpose(dim0=1, dim1=0),
                                                 torch.cat(vals),
                                                 quant_matrix.shape)
    def forward(self, x):
        return torch.functional.F.embedding(x, self.spmatrix.to_dense())

In [0]:
clusters = Clusterize(model.bert.embeddings.word_embeddings.weight.cpu().data, 5)
quant_matrix, _ = clusters.get_clustered_matrix(5)
model.bert.embeddings.word_embeddings = QuantLayerByteEmb(quant_matrix.to(device))

### Linear layers quantization

In [0]:
class QuantLayerByte(torch.nn.Module):
    def __init__(self, quant_matrix, bias=None):
        super(QuantLayerByte, self).__init__()
        self.values = torch.unique(quant_matrix)
        self.bias = bias
        indx = [torch.nonzero(quant_matrix == value) for value in self.values]
        vals = [torch.ones(ind.shape[0]).to(device) * value for ind, value in zip(indx, self.values)]
        self.spmatrix = torch.sparse.FloatTensor(torch.cat(indx).transpose(dim0=1, dim1=0),
                                                 torch.cat(vals),
                                                 quant_matrix.shape)
    def forward(self, x):
        return torch.functional.F.linear(x, self.spmatrix.to_dense(), self.bias)

Quantization of the last 6 transformer layers:

In [16]:
for transformer_layer_ind in tqdm(range(6,12)):
    clusters = Clusterize(model.bert.encoder.layer[transformer_layer_ind].attention.self.query.weight.cpu().data, 5)
    quant_matrix, _ = clusters.get_clustered_matrix(5)
    bias = model.bert.encoder.layer[transformer_layer_ind].attention.self.query.bias
    model.bert.encoder.layer[transformer_layer_ind].attention.self.query = QuantLayerByte(quant_matrix.to(device), bias)
    
    clusters = Clusterize(    model.bert.encoder.layer[transformer_layer_ind].attention.self.key.weight.cpu().data, 5)
    quant_matrix, _ = clusters.get_clustered_matrix(5)
    bias = model.bert.encoder.layer[transformer_layer_ind].attention.self.key.bias
    model.bert.encoder.layer[transformer_layer_ind].attention.self.key = QuantLayerByte(quant_matrix.to(device), bias)
    
    
    clusters = Clusterize(model.bert.encoder.layer[transformer_layer_ind].attention.self.value.weight.cpu().data, 5)
    quant_matrix, _ = clusters.get_clustered_matrix(5)
    bias = model.bert.encoder.layer[transformer_layer_ind].attention.self.value.bias
    model.bert.encoder.layer[transformer_layer_ind].attention.self.value = QuantLayerByte(quant_matrix.to(device), bias)
    
    
    clusters = Clusterize(model.bert.encoder.layer[transformer_layer_ind].attention.output.dense.weight.cpu().data, 5)
    quant_matrix, _ = clusters.get_clustered_matrix(5)
    bias = model.bert.encoder.layer[transformer_layer_ind].attention.output.dense.bias
    model.bert.encoder.layer[transformer_layer_ind].attention.output.dense = QuantLayerByte(quant_matrix.to(device), bias)
    
    
    clusters = Clusterize(model.bert.encoder.layer[transformer_layer_ind].intermediate.dense.weight.cpu().data, 5)
    quant_matrix, _ = clusters.get_clustered_matrix(5)
    bias = model.bert.encoder.layer[transformer_layer_ind].intermediate.dense.bias
    model.bert.encoder.layer[transformer_layer_ind].intermediate.dense = QuantLayerByte(quant_matrix.to(device), bias)
    
    
    clusters = Clusterize(model.bert.encoder.layer[transformer_layer_ind].output.dense.weight.cpu().data, 5)
    quant_matrix, _ = clusters.get_clustered_matrix(5)
    bias = model.bert.encoder.layer[transformer_layer_ind].output.dense.bias
    model.bert.encoder.layer[transformer_layer_ind].output.dense = QuantLayerByte(quant_matrix.to(device), bias)

100%|██████████| 6/6 [23:47<00:00, 236.62s/it]


Two epoches training after quantization:

In [17]:
EPOCH_NUM = 1

params['num_train_epochs'] = 1
checkpoint_files = {
    'config': 'bert_config.json',
    'file_to_save': 'model_{}_epoch_{}.pth'.format(
        params['task_name'], EPOCH_NUM)
}

model, result = train(model, tokenizer, params,
                      train_examples,
                      valid_examples=dev_examples,
                      checkpoint_files=checkpoint_files)
result

***** Running training *****
Num examples: 67349
Batch size:   32
Num steps:    2104


Iteration:   0%|          | 0/2105 [00:00<?, ?it/s]


Epoch: 1


Iteration: 100%|██████████| 2105/2105 [58:20<00:00,  1.54s/it]


{'train_loss': 0.12293930361290725, 'train_global_step': 2105}
***** Running evaluation *****
Num examples:  872
Batch size:    8


Evaluating: 100%|██████████| 109/109 [00:52<00:00,  2.06it/s]


{'eval_loss': 0.2178882598109006, 'eval_accuracy': 0.9174311926605505, 'eval_f1_score': 0.9198218262806236, 'eval_matthews_corrcoef': 0.834957173451512}


{'train_global_step': 2105, 'train_loss': 0.12293930361290725}

In [18]:
EPOCH_NUM = 2

params['num_train_epochs'] = 1
checkpoint_files = {
    'config': 'bert_config.json',
    'file_to_save': 'model_{}_epoch_{}.pth'.format(
        params['task_name'], EPOCH_NUM)
}

model, result = train(model, tokenizer, params,
                      train_examples,
                      valid_examples=dev_examples,
                      checkpoint_files=checkpoint_files)
result

***** Running training *****
Num examples: 67349
Batch size:   32
Num steps:    2104


Iteration:   0%|          | 0/2105 [00:00<?, ?it/s]


Epoch: 1


Iteration: 100%|██████████| 2105/2105 [58:06<00:00,  1.53s/it]


{'train_loss': 0.05766299047248522, 'train_global_step': 2105}
***** Running evaluation *****
Num examples:  872
Batch size:    8


Evaluating: 100%|██████████| 109/109 [00:52<00:00,  2.06it/s]


{'eval_loss': 0.25082281055555034, 'eval_accuracy': 0.9185779816513762, 'eval_f1_score': 0.9211986681465039, 'eval_matthews_corrcoef': 0.8373848900015683}


{'train_global_step': 2105, 'train_loss': 0.05766299047248522}

Quantization of the first 6 layers of transformer:

In [19]:
for transformer_layer_ind in tqdm(range(6)):
  
    clusters = Clusterize(model.bert.encoder.layer[transformer_layer_ind].attention.self.query.weight.cpu().data, 5)
    quant_matrix, _ = clusters.get_clustered_matrix(5)
    bias = model.bert.encoder.layer[transformer_layer_ind].attention.self.query.bias
    model.bert.encoder.layer[transformer_layer_ind].attention.self.query = QuantLayerByte(quant_matrix.to(device), bias)
    
    clusters = Clusterize(model.bert.encoder.layer[transformer_layer_ind].attention.self.key.weight.cpu().data, 5)
    quant_matrix, _ = clusters.get_clustered_matrix(5)
    bias = model.bert.encoder.layer[transformer_layer_ind].attention.self.key.bias
    model.bert.encoder.layer[transformer_layer_ind].attention.self.key = QuantLayerByte(quant_matrix.to(device), bias)
    
    
    clusters = Clusterize(model.bert.encoder.layer[transformer_layer_ind].attention.self.value.weight.cpu().data, 5)
    quant_matrix, _ = clusters.get_clustered_matrix(5)
    bias = model.bert.encoder.layer[transformer_layer_ind].attention.self.value.bias
    model.bert.encoder.layer[transformer_layer_ind].attention.self.value = QuantLayerByte(quant_matrix.to(device), bias)
    
    
    clusters = Clusterize(model.bert.encoder.layer[transformer_layer_ind].attention.output.dense.weight.cpu().data, 5)
    quant_matrix, _ = clusters.get_clustered_matrix(5)
    bias = model.bert.encoder.layer[transformer_layer_ind].attention.output.dense.bias
    model.bert.encoder.layer[transformer_layer_ind].attention.output.dense = QuantLayerByte(quant_matrix.to(device), bias)
    
    
    clusters = Clusterize(model.bert.encoder.layer[transformer_layer_ind].intermediate.dense.weight.cpu().data, 5)
    quant_matrix, _ = clusters.get_clustered_matrix(5)
    bias = model.bert.encoder.layer[transformer_layer_ind].intermediate.dense.bias
    model.bert.encoder.layer[transformer_layer_ind].intermediate.dense = QuantLayerByte(quant_matrix.to(device), bias)
    
    
    clusters = Clusterize(model.bert.encoder.layer[transformer_layer_ind].output.dense.weight.cpu().data, 5)
    quant_matrix, _ = clusters.get_clustered_matrix(5)
    bias = model.bert.encoder.layer[transformer_layer_ind].output.dense.bias
    model.bert.encoder.layer[transformer_layer_ind].output.dense = QuantLayerByte(quant_matrix.to(device), bias)   

100%|██████████| 6/6 [24:21<00:00, 243.80s/it]


### Final accuracy

In [20]:
result, prob_preds = evaluate(model, tokenizer, params,
                              dev_examples)
result

***** Running evaluation *****
Num examples:  872
Batch size:    8


Evaluating: 100%|██████████| 109/109 [01:18<00:00,  1.39it/s]


{'eval_accuracy': 0.9105504587155964,
 'eval_f1_score': 0.9129464285714285,
 'eval_loss': 0.24320432879693551,
 'eval_matthews_corrcoef': 0.8211187974949624}

### Compression rate

In [0]:
torch.save(model.state_dict(), 'model_quant_all.pt')

In [22]:
! ls -lS

total 439376
-rw-r--r-- 1 root root 437982975 Mar 29 23:03 model.pt
-rw-r--r-- 1 root root   7439277 May  2  2018 SST-2.zip
-rw-r--r-- 1 root root   4467526 Mar 30 02:04 model_quant_all.pt
drwxr-xr-x 6 root root      4096 Mar 29 23:02 lilbert
drwxr-xr-x 1 root root      4096 Mar 27 20:26 sample_data
drwxrwxr-x 3 root root      4096 May  2  2018 SST-2


In [1]:
print("Compression rate:", 437982975 / 4467526)

Compression rate: 98.03702877162885


In [0]:
from google.colab import files
files.download('model_quant_all.pt')